#IMPORT (wrangled) data 
# Read the two already-wrangled CSV exports that the rest of the notebook
# builds on: the coded utterances and the telemetry of representations.
df_raw_insights <- read_csv("data/CLEAN_coded_utterances.csv") 
df_raw_telemetry <- read_csv("data/arf_CLEAN_telemetry_representations.csv") 

## PRIMARY UTTERANCE DF
# One row per utterance + detail-code pairing, so rows are NOT unique
# utterances: an utterance carrying two detail codes appears twice.
df_coded <- df_raw_insights %>%
  mutate(
    # ---- identifiers ----
    sid = factor(SID), # one id per utterance + detail-code row
    # participant id; levels list the happiness-first group, then space-first
    pid = factor(
      PID,
      levels = c(
        # HAPPINESS-FIRST
        "bjs827ee1u", "3r2sh20ei", "4728sjuiz", "7ACC0B75", "92ghd48xe",
        "iurmer289", "s294hoei",
        # SPACE-FIRST
        "j2719eertu2", "lkin27js09b", "li832lin23", "7382kwtue", "E1D39056",
        "8v892iige"
      )
    ),
    PNUM = factor(
      PNUM,
      levels = c(
        "P6", "P9", "P10", "P2", "P4", "P12", "P13",
        "P5", "P7", "P8", "P3", "P1", "P11"
      )
    ),
    # numeric id shared by every row with the same participant + utterance text
    uid = factor(as.numeric(factor(paste(pid, factor(Utterance))))),

    # ---- design factors ----
    # lower-case labels, ordered static -> ixn
    TASK = factor(
      recode(Condition, "Static" = "static", "Interactive" = "ixn"),
      levels = c("static", "ixn")
    ),
    # Notebook renamed to DATASET
    DATASET = factor(recode(Notebook, "Happiness" = "happiness", "Space" = "space")),
    # which dataset the participant saw first, derived from TASK x DATASET
    data_order = recode(
      factor(paste(TASK, "_", DATASET)),
      "ixn _ happiness" = "space-first",
      "ixn _ space" = "happiness-first",
      "static _ happiness" = "happiness-first",
      "static _ space" = "space-first"
    ),

    # ---- utterance text and linked representations ----
    utterance = Utterance,
    reps_group = factor(Final_Group),
    reps_all = factor(`All representations`),

    # ---- reviewer flags ----
    flag_story = `Dylan Flag Storytelling`,
    flag_correction = `Dylan Flag Correction`,
    flag_simultaneous = `Dylan Flag Simultaneous Characterization`,

    # ---- codes ----
    # top-level topic, with "ANALYSIS PROCESS" shortened and a fixed order
    code_topic = factor(
      recode(Highlevel, "ANALYSIS PROCESS" = "PROCESS"),
      levels = c("PROCESS", "DATASET", "VARIABLE", "RELATIONSHIP")
    ),
    code_datatype = factor(`Data Type`),
    # two sparse detail codes (fewer than 3 obs) are folded into neighbours:
    #   dist. variance -> dist. shape ; rel. faceted -> rel. strength
    code_detail = factor(`Utterance Type`) %>%
      recode(
        "distribution variance (sd, var)" = "distribution shape [shape, skew, kurtosis]",
        "relationship faceted distribution characterization" = "relationship strength and/or direction"
      ),

    # ---- timing and interaction ----
    timestamp = adj_timestamp,
    ixn = factor(interaction_used) # was interaction used?
  ) %>%
  dplyr::select(
    sid, pid, PNUM, uid, TASK, DATASET, timestamp, ixn,
    code_topic, code_detail, code_datatype,
    flag_story, flag_correction, flag_simultaneous,
    utterance, reps_group, reps_all, data_order
  ) %>%
  arrange(data_order)

# Treat a missing flag as "not flagged": NA -> FALSE in the three flag columns
df_coded <- df_coded %>%
  mutate(
    across(
      c(flag_story, flag_correction, flag_simultaneous),
      ~ replace(.x, is.na(.x), FALSE)
    )
  )



## NOW WRANGLE TIME INFO
# Per participant and task: take the earliest and latest utterance time as
# task start/end, then express each utterance relative to that start.
df_coded <- df_coded %>%
  mutate(time = hms::as_hms(timestamp)) %>%
  group_by(pid, TASK) %>%
  mutate(
    task_start      = hms::as_hms(min(time)),
    task_end        = hms::as_hms(max(time)),
    task_mins       = round(difftime(task_end, task_start, units = "mins"), 1),
    task_second     = task_end - task_start,
    relative_time_s = timestamp - task_start,
    relative_time   = as.double(relative_time_s)
  ) %>%
  ungroup()


##JOINED REPRESENTATIONS DURING UTTERANCES DF
# Start from the representations recorded against each coded utterance.
# After the split below, a row is one utterance-coding + representation pair
# (many-to-many relationship between utterances and representations).
df_joined <- df_coded %>%
  dplyr::select(-data_order, -utterance, -flag_story, -flag_correction) %>%
  mutate(
    # Rename labels BEFORE splitting on "_": "data_dictionary" contains the
    # delimiter itself and would otherwise be cut into two pieces.
    reps_group = str_replace(reps_group, "data_dictionary", "dictionary"),
    reps_group = str_replace(reps_group, "Multi-view Chart", "multiviewchart"),
    # flag utterances linked to more than one representation
    reps_multi = str_detect(reps_group, "_")
  ) %>%
  # one row per representation named in reps_group
  separate_longer_delim(reps_group, delim = "_") %>%
  mutate(REP = factor(reps_group)) %>%
  dplyr::select(-reps_group) %>% # superseded by REP
  mutate(
    # collapse individual representations into a high-level type;
    # each key is listed once (the original listed "profile" twice)
    rep_type = recode(
      REP,
      "hist" = "CHART",
      "profile" = "CHART",
      "scatterplot" = "CHART",
      "barplot" = "CHART",
      "stripplot" = "CHART",
      "lineplot" = "CHART",
      "heatmap" = "CHART",
      "pairplot" = "CHART",
      "multiviewchart" = "CHART",
      "double-profiler" = "CHART",
      "python" = "CODE",
      "dictionary" = "CODE",
      "describe" = "CODE",
      "dataframe" = "CODE",
      "info" = "CODE",
      "columns" = "CODE",
      "none" = "NONE"
    )
  )

##PRIMARY REPRESENTATION DF
# NOTE: the row granularity of this frame is still undocumented
# (original note: "ROW ==?").
df_telemetry <- df_raw_telemetry %>%
  mutate(
    # PARTICIPANT DATA
    # levels list the happiness-first group, then the space-first group
    pid = factor(
      pID,
      levels = c(
        # HAPPINESS-FIRST
        "bjs827ee1u", "3r2sh20ei", "4728sjuiz", "7ACC0B75", "92ghd48xe",
        "iurmer289", "s294hoei",
        # SPACE-FIRST
        "j2719eertu2", "lkin27js09b", "li832lin23", "7382kwtue", "E1D39056",
        "8v892iige"
      )
    ),
    PNUM = factor(
      PNUM,
      levels = c(
        "P6", "P9", "P10", "P2", "P4", "P12", "P13",
        "P5", "P7", "P8", "P3", "P1", "P11"
      )
    ),

    TASK = factor(TASK, levels = c("static", "ixn")), # reorder factor levels
    DATASET = factor(session, levels = c("happiness", "space")),
    # which dataset the participant saw first, derived from TASK x DATASET
    data_order = factor(paste(TASK, "_", DATASET)),
    data_order = recode(
      data_order,
      "ixn _ happiness" = "space-first",
      "ixn _ space" = "happiness-first",
      "static _ happiness" = "happiness-first",
      "static _ space" = "space-first"
    ),
    # timestamp and time_elapsed pass through unchanged, so the original
    # self-assignments (and the never-selected `code` copy of cell_content)
    # have been removed.
    technique = factor(`Interaction_Techniques`),
    IXN = (technique != "none"),

    REP = factor(merged_output_type),
    REP = factor(str_replace(REP, "Multi-view Chart", "multiviewchart"))
  ) %>%
  filter( # EXCLUDE SOME REPS
    # none and other are python code not of the tabular forms we look for
    REP %nin% c("none", "error", "markdown", "other")
  ) %>%
  mutate(
    # fold variant names into their canonical representation
    REP = recode_factor(
      REP,
      "double-profiler" = "profile",
      "countplot" = "barplot"
    ),
    REP = factor(REP), # reset factor levels
    # "double-profiler" and "countplot" no longer exist as levels at this
    # point (merged just above), so they are not listed here.
    rep_type = recode(
      REP,
      # "dictionary" = "TABLE",
      "describe" = "TABLE",
      "dataframe" = "TABLE",
      "info" = "TABLE",
      "columns" = "TABLE",
      "hist" = "CHART",
      "profile" = "CHART",
      "barplot" = "CHART",
      "scatterplot" = "CHART",
      "lineplot" = "CHART",
      "stripplot" = "CHART",
      "pairplot" = "CHART",
      "heatmap" = "CHART",
      "multiviewchart" = "CHART"
    )
  ) %>%
  dplyr::select(
    pid, PNUM, TASK, DATASET, data_order, timestamp, time_elapsed,
    technique, IXN, REP, rep_type, cell_content
  )

1 UTTERANCES PROFILE

There are 742 rows in the df_coded dataset, where each row represents an utterance coding (i.e. utterance + detail code). There are 662 unique utterances. The difference indicates utterances that were dual-coded (i.e. two detail-level codes). No more than two codes were applied to a single utterance. For the purposes of analysis, dual-coded utterances will be treated as two utterances, as they have two distinct (but lexically inseparable) units of meaning.

# Render a per-column profile of df_coded
summarytools::dfSummary(
  df_coded,
  method       = "render",
  style        = "grid",
  plain.ascii  = FALSE,
  graph.magnif = 0.75,
  missing.col  = FALSE,
  tmp.img.dir  = "temp"
)

1.0.1 Data Frame Summary

1.0.1.1 df_coded

Dimensions: 742 x 25
Duplicates: 0

No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 sid
[factor]
1. 0
2. 1
3. 2
4. 3
5. 4
6. 5
7. 6
8. 7
9. 8
10. 9
[ 732 others ]
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
732 (98.7%)
742
(100.0%)
0
(0.0%)
2 pid
[factor]
1. bjs827ee1u
2. 3r2sh20ei
3. 4728sjuiz
4. 7ACC0B75
5. 92ghd48xe
6. iurmer289
7. s294hoei
8. j2719eertu2
9. lkin27js09b
10. li832lin23
[ 3 others ]
29 ( 3.9%)
103 (13.9%)
43 ( 5.8%)
28 ( 3.8%)
56 ( 7.5%)
87 (11.7%)
88 (11.9%)
82 (11.1%)
48 ( 6.5%)
51 ( 6.9%)
127 (17.1%)
742
(100.0%)
0
(0.0%)
3 PNUM
[factor]
1. P6
2. P9
3. P10
4. P2
5. P4
6. P12
7. P13
8. P5
9. P7
10. P8
[ 3 others ]
29 ( 3.9%)
103 (13.9%)
43 ( 5.8%)
28 ( 3.8%)
56 ( 7.5%)
87 (11.7%)
88 (11.9%)
82 (11.1%)
48 ( 6.5%)
51 ( 6.9%)
127 (17.1%)
742
(100.0%)
0
(0.0%)
4 uid
[factor]
1. 1
2. 2
3. 3
4. 4
5. 5
6. 6
7. 7
8. 8
9. 9
10. 10
[ 652 others ]
2 ( 0.3%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
2 ( 0.3%)
1 ( 0.1%)
1 ( 0.1%)
730 (98.4%)
742
(100.0%)
0
(0.0%)
5 TASK
[factor]
1. static
2. ixn
403 (54.3%)
339 (45.7%)
742
(100.0%)
0
(0.0%)
6 DATASET
[factor]
1. happiness
2. space
431 (58.1%)
311 (41.9%)
742
(100.0%)
0
(0.0%)
7 timestamp
[hms, difftime]
min : 622
med : 2857
max : 6900
units : secs
622 distinct values 742
(100.0%)
0
(0.0%)
8 ixn
[factor]
1. FALSE
2. TRUE
633 (85.3%)
109 (14.7%)
742
(100.0%)
0
(0.0%)
9 code_topic
[factor]
1. PROCESS
2. DATASET
3. VARIABLE
4. RELATIONSHIP
160 (21.6%)
176 (23.7%)
122 (16.4%)
284 (38.3%)
742
(100.0%)
0
(0.0%)
10 code_detail
[factor]
1. data orientation
2. data provenance
3. data size
4. distribution outlier (var
5. distribution range [min,
6. distribution shape [shape
7. missing data
8. outlier (relationship)
9. plan of action
10. relationship cluster(s)/s
[ 6 others ]
16 ( 2.2%)
11 ( 1.5%)
9 ( 1.2%)
9 ( 1.2%)
33 ( 4.4%)
80 (10.8%)
76 (10.2%)
20 ( 2.7%)
52 ( 7.0%)
29 ( 3.9%)
407 (54.9%)
742
(100.0%)
0
(0.0%)
11 code_datatype
[factor]
1. distribution (continuous
2. distribution (categorical
3. relationship (categorical
4. relationship (categorical
5. relationship (continuous
6. relationship (multivariat
76 (17.8%)
54 (12.7%)
28 ( 6.6%)
55 (12.9%)
146 (34.3%)
67 (15.7%)
426
(57.4%)
316
(42.6%)
12 flag_story
[logical]
1. FALSE
2. TRUE
700 (94.3%)
42 ( 5.7%)
742
(100.0%)
0
(0.0%)
13 flag_correction
[logical]
1. FALSE
2. TRUE
733 (98.8%)
9 ( 1.2%)
742
(100.0%)
0
(0.0%)
14 flag_simultaneous
[logical]
1. FALSE
2. TRUE
682 (91.9%)
60 ( 8.1%)
742
(100.0%)
0
(0.0%)
15 utterance
[character]
1. [Talking about the profil
2. actually, let me see if p
3. Although we have like les
4. And are they within range
5. And confidence in governm
6. And just I want to see ho
7. And so it looks like it s
8. And then if I had more ti
9. Because it does seem like
10. Data frame. Got a bunch o
[ 652 others ]
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
722 (97.3%)
742
(100.0%)
0
(0.0%)
16 reps_group
[factor]
1. barplot
2. columns
3. columns_data_dictionary
4. data_dictionary
5. data_dictionary_dataframe
6. data_dictionary_describe
7. dataframe
8. dataframe_describe
9. dataframe_heatmap
10. dataframe_pairplot
[ 15 others ]
16 ( 2.2%)
4 ( 0.5%)
1 ( 0.1%)
56 ( 7.5%)
1 ( 0.1%)
9 ( 1.2%)
76 (10.2%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
576 (77.6%)
742
(100.0%)
0
(0.0%)
17 reps_all
[factor]
1. affect_corruption_brush_7
2. Age_CryoSleep_scatterplot
3. age_CryoSleep_ShoppingMal
4. Age_RoomService_scatterpl
5. age_roomservice_scatterpl
6. Age_RoomService_scatterpl
7. Age_ShoppingMall_scatterp
8. altair_profile_contVars_j
9. alx_barplot_df_homeplanet
10. alx_barplot_df_homeplanet
[ 245 others ]
4 ( 0.6%)
3 ( 0.4%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
2 ( 0.3%)
1 ( 0.1%)
693 (97.7%)
709
(95.6%)
33
(4.4%)
18 data_order
[factor]
1. space-first
2. happiness-first
308 (41.5%)
434 (58.5%)
742
(100.0%)
0
(0.0%)
19 time
[hms, difftime]
min : 622
med : 2857
max : 6900
units : secs
622 distinct values 742
(100.0%)
0
(0.0%)
20 task_start
[hms, difftime]
min : 622
med : 2123
max : 5349
units : secs
26 distinct values 742
(100.0%)
0
(0.0%)
21 task_end
[hms, difftime]
min : 1811
med : 3815
max : 6900
units : secs
26 distinct values 742
(100.0%)
0
(0.0%)
22 task_mins
[difftime]
min : 6.5
med : 24.1
max : 28.5
units : mins
26 distinct values 742
(100.0%)
0
(0.0%)
23 task_second
[difftime]
min : 388
med : 1449
max : 1712
units : secs
26 distinct values 742
(100.0%)
0
(0.0%)
24 relative_time_s
[difftime]
min : 0
med : 552
max : 1712
units : secs
506 distinct values 742
(100.0%)
0
(0.0%)
25 relative_time
[numeric]
Mean (sd) : 616.1 (459)
min < med < max:
0 < 552 < 1712
IQR (CV) : 794.8 (0.7)
506 distinct values 742
(100.0%)
0
(0.0%)

ARF has reviewed data profile for missing data and correct factorization.

2 REPRESENTATIONS PROFILE

There are 504 rows in the df_telemetry dataset, where each row represents TODO

There are TODO 662 unique utterances. The difference indicates utterances that were dual-coded (i.e. two detail-level codes). No more than two codes were applied to a single utterance. For the purposes of analysis, dual-coded utterances will be treated as two utterances, as they have two distinct (but lexically inseparable) units of meaning.

# Render a per-column profile of df_telemetry
summarytools::dfSummary(
  df_telemetry,
  method       = "render",
  style        = "grid",
  plain.ascii  = FALSE,
  graph.magnif = 0.75,
  missing.col  = FALSE,
  tmp.img.dir  = "temp"
)

2.0.1 Data Frame Summary

2.0.1.1 df_telemetry

Dimensions: 504 x 12
Duplicates: 16

No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 pid
[factor]
1. bjs827ee1u
2. 3r2sh20ei
3. 4728sjuiz
4. 7ACC0B75
5. 92ghd48xe
6. iurmer289
7. s294hoei
8. j2719eertu2
9. lkin27js09b
10. li832lin23
[ 3 others ]
43 ( 8.5%)
41 ( 8.1%)
33 ( 6.5%)
19 ( 3.8%)
31 ( 6.2%)
27 ( 5.4%)
31 ( 6.2%)
39 ( 7.7%)
15 ( 3.0%)
65 (12.9%)
160 (31.7%)
504
(100.0%)
0
(0.0%)
2 PNUM
[factor]
1. P6
2. P9
3. P10
4. P2
5. P4
6. P12
7. P13
8. P5
9. P7
10. P8
[ 3 others ]
43 ( 8.5%)
41 ( 8.1%)
33 ( 6.5%)
19 ( 3.8%)
31 ( 6.2%)
27 ( 5.4%)
31 ( 6.2%)
39 ( 7.7%)
15 ( 3.0%)
65 (12.9%)
160 (31.7%)
504
(100.0%)
0
(0.0%)
3 TASK
[factor]
1. static
2. ixn
232 (46.0%)
272 (54.0%)
504
(100.0%)
0
(0.0%)
4 DATASET
[factor]
1. happiness
2. space
301 (59.7%)
203 (40.3%)
504
(100.0%)
0
(0.0%)
5 data_order
[factor]
1. space-first
2. happiness-first
279 (55.4%)
225 (44.6%)
504
(100.0%)
0
(0.0%)
6 timestamp
[numeric]
Min : 1.68e+12
Mean : 1683018480492.8
Max : 1.69e+12
1.68e+12 : 340 (69.8%)
1.69e+12 : 147 (30.2%)
487
(96.6%)
17
(3.4%)
7 time_elapsed
[numeric]
Mean (sd) : 745818.9 (466847.7)
min < med < max:
1948 < 736053 < 1673757
IQR (CV) : 776174 (0.6)
398 distinct values 398
(79.0%)
106
(21.0%)
8 technique
[factor]
1. filter_brush
2. filter_brush+filter_slide
3. filter_brush+filter_slide
4. filter_brush+filter_slide
5. filter_slider
6. filter_slider+highlight_b
7. filter_slider+highlight_b
8. filter_slider+highlight_b
9. filter_slider+highlight_p
10. filter_slider+pan_zoom+to
[ 11 others ]
11 ( 2.2%)
2 ( 0.4%)
2 ( 0.4%)
1 ( 0.2%)
11 ( 2.2%)
5 ( 1.0%)
2 ( 0.4%)
10 ( 2.0%)
2 ( 0.4%)
4 ( 0.8%)
454 (90.1%)
504
(100.0%)
0
(0.0%)
9 IXN
[logical]
1. FALSE
2. TRUE
337 (66.9%)
167 (33.1%)
504
(100.0%)
0
(0.0%)
10 REP
[factor]
1. profile
2. barplot
3. columns
4. dataframe
5. describe
6. heatmap
7. hist
8. info
9. lineplot
10. multiviewchart
[ 3 others ]
42 ( 8.3%)
43 ( 8.5%)
30 ( 6.0%)
101 (20.0%)
20 ( 4.0%)
7 ( 1.4%)
19 ( 3.8%)
6 ( 1.2%)
18 ( 3.6%)
43 ( 8.5%)
175 (34.7%)
504
(100.0%)
0
(0.0%)
11 rep_type
[factor]
1. CHART
2. TABLE
347 (68.8%)
157 (31.2%)
504
(100.0%)
0
(0.0%)
12 cell_content
[character]
1. df.head()
2. df
3. df.columns
4. df.describe()
5. df.info()
6. columns = df.columns # re
7. df.isna()
8. df.isnull()
9. alx.pairplot(df)
10. alx.lineplot(df[df.countr
[ 374 others ]
19 ( 3.8%)
18 ( 3.6%)
10 ( 2.0%)
9 ( 1.8%)
7 ( 1.4%)
5 ( 1.0%)
5 ( 1.0%)
5 ( 1.0%)
4 ( 0.8%)
3 ( 0.6%)
419 (83.1%)
504
(100.0%)
0
(0.0%)

TODO ARF has reviewed data profile for missing data and correct factorization BUT wrangling needed on technique (pivotlonger) + need to normalize REP across barplot+countplot

3 UTTERANCES

Utterances are the lowest-level discrete units of meaning transcribed from the EDA Task transcripts. Utterances are coded at two levels of analysis: (1) topic-code gives a high level topic of the participant’s verbalization, (2) detail-code gives the lower level detail of the subject.

In the following subsections we explore the distribution of number of utterances based on TASK, DATASET, and PARTICIPANT, before describing the distribution of utterances through the timecourse of the TASK.

3.1 [Aggregated] Utterances

FIRST we explore the distribution of utterances by Analysis Task, Dataset, Participant and Time, irrespective of what the utterance was about (topic, detail).

RQ: How much did participants talk aloud during EDA? When did they talk aloud?

Answer: Inspection of frequency tables and visualizations suggests that the most substantial determinant of how many utterances an individual made is individual participant-level differences, rather than structural differences imposed by the TASK or DATASET. This is not altogether unexpected given the fact that across both tasks (static/interactive) and datasets the structure of the experimental task was the same.

3.1.1 by TASK

# Print a plain-text marker for this section of the output.
print("BY TASK")

[1] “BY TASK”

# Frequency table of utterance codings per TASK
# (no cumulative columns, no heading block, no NA row)
freq(
  df_coded$TASK,
  plain.ascii = FALSE,
  report.nas  = FALSE,
  headings    = FALSE,
  cumul       = FALSE
)
  Freq %
static 403 54.31
ixn 339 45.69
Total 742 100.00

3.1.2 by TASK and DATASET

# Cross-tabulate TASK (rows) against DATASET (columns), with each cell shown
# as a share of the grand total (prop = "t")
ctable(df_coded$TASK, df_coded$DATASET, prop = "t")

Cross-Tabulation, Total Proportions
TASK * DATASET
Data Frame: df_coded

DATASET happiness space Total
TASK
static 263 (35.4%) 140 (18.9%) 403 ( 54.3%)
ixn 168 (22.6%) 171 (23.0%) 339 ( 45.7%)
Total 431 (58.1%) 311 (41.9%) 742 (100.0%)
#DF SUMMARIZED BY TASK + DATASET
# .groups = "drop" returns an ungrouped tibble: no "grouped output" message
# is printed and no leftover grouping is carried into the plot data.
df_summary <- df_coded %>%
  group_by(TASK, DATASET) %>%
  dplyr::summarise(c = n(), .groups = "drop")

#STACKED BAR BY TASK
# one bar per TASK, stacked by DATASET, with the count printed in each segment
ggplot(df_summary, aes(x = TASK, y = c, fill = DATASET)) +
  geom_col() +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  # scale_fill_brewer(type="qual", palette = 4) +
  labs(
    title = "Utterances by TASK and DATASET",
    subtitle = "More utterances in STATIC; more utterances in HAPPINESS",
    x = "TASK", y = "count"
  ) +
  theme_minimal()

# + theme(legend.position = "blank")

3.1.3 by PARTICIPANT

# Cross-tabulate participant (rows) against TASK (columns), with each cell
# shown as a share of its row (prop = "r")
ctable(df_coded$PNUM, df_coded$TASK, prop = "r")

Cross-Tabulation, Row Proportions
PNUM * TASK
Data Frame: df_coded

TASK static ixn Total
PNUM
P6 11 (37.9%) 18 (62.1%) 29 (100.0%)
P9 63 (61.2%) 40 (38.8%) 103 (100.0%)
P10 30 (69.8%) 13 (30.2%) 43 (100.0%)
P2 18 (64.3%) 10 (35.7%) 28 (100.0%)
P4 35 (62.5%) 21 (37.5%) 56 (100.0%)
P12 46 (52.9%) 41 (47.1%) 87 (100.0%)
P13 60 (68.2%) 28 (31.8%) 88 (100.0%)
P5 33 (40.2%) 49 (59.8%) 82 (100.0%)
P7 29 (60.4%) 19 (39.6%) 48 (100.0%)
P8 17 (33.3%) 34 (66.7%) 51 (100.0%)
P3 24 (44.4%) 30 (55.6%) 54 (100.0%)
P1 10 (40.0%) 15 (60.0%) 25 (100.0%)
P11 27 (56.2%) 21 (43.8%) 48 (100.0%)
Total 403 (54.3%) 339 (45.7%) 742 (100.0%)
# UTTERANCES by PARTICIPANT, faceted by TASK
# bar chart built from the PNUM ~ . formula, filled by DATASET
gf_bar(PNUM ~ ., data = df_coded, fill = ~DATASET) %>%
  gf_facet_grid(. ~ TASK) +
  labs(
    fill     = "DATASET",
    y        = "participant",
    x        = "number of coded utterances",
    subtitle = "",
    title    = "Utterances by Participant, Dataset and Task"
  )

3.1.4 through TIME

#DOTPLOT—BW
# one point per utterance coding, placed at its time since task start;
# rows of panels = TASK. No colour aesthetic is mapped, so the colour scale
# and colour legend title from the original were dropped.
ggplot(df_coded, aes(x = relative_time, y = PNUM)) +
  geom_point(alpha = 0.5, size = 3) +
  # facet by column name rather than by a df_coded$... vector, so the facet
  # variable is always looked up in the data being plotted
  facet_grid(rows = vars(TASK)) +
  theme_minimal() +
  labs(
    title = "Participant Utterances over timecourse of Task",
    x = "timecourse of task (seconds)", y = "Participant"
  )

#HISTOGRAMS BY TASK
# 30-second bins on a density scale with a density curve overlaid
ggplot(df_coded, aes(x = relative_time)) +
  # after_stat(density) replaces the deprecated ..density.. notation
  geom_histogram(aes(y = after_stat(density)), binwidth = 30) +
  geom_density() +
  facet_grid(rows = vars(TASK)) +
  theme_minimal() +
  # "none" is the documented value for hiding the legend
  theme(legend.position = "none") +
  labs(
    title = "Participant Utterances over timecourse of Task",
    x = "timecourse of task (seconds)", y = "frequency of utterances"
  )
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.

3.2 [TOPIC of] Utterances

NEXT we explore the distribution of utterances coded by high level TOPIC, across Analysis Task, Dataset, Participant and Time.

RQ: What kinds of things did participants talk aloud during EDA? Did they progress through any ‘topical phases’ over the course of the task? Or are topics equally distributed across analysis time?

Answer: Inspection of frequency tables and visualizations suggests that:

  1. Individual differences continue to play an important role

  2. There do not appear to be strong TASK/DATASET effects on topic that are consistent across participants.

  3. PROCESS and RELATIONSHIP topics are more evenly distributed across the timecourse of analysis, while DATASET AND VARIABLE topics are more tightly clustered near the beginning of the analysis. This pattern of distribution is sensical given what we know about EDA, and is consistent with the intuition that patterns of thought during EDA are likely more iterative and situational than we think (or model).

3.2.1 TOPICS

# Frequency table of utterance codings per top-level topic
# (with heading block, no cumulative columns, no NA row)
freq(
  df_coded$code_topic,
  plain.ascii = FALSE,
  report.nas  = FALSE,
  headings    = TRUE,
  cumul       = FALSE
)
## ### Frequencies  
## #### df_coded$code_topic  
## **Type:** Factor  
## 
##             &nbsp;   Freq        %
## ------------------ ------ --------
##        **PROCESS**    160    21.56
##        **DATASET**    176    23.72
##       **VARIABLE**    122    16.44
##   **RELATIONSHIP**    284    38.27
##          **Total**    742   100.00
# Raw counts per topic
gf_bar(~code_topic, fill = ~code_topic, position = "stack", data = df_coded) +
  scale_fill_brewer(type = "qual", palette = 3)

# Share of utterance codings per topic; `dumm` is a constant factor so all
# topics stack into a single bar.
df_summary <- df_coded %>%
  group_by(code_topic) %>%
  dplyr::summarise(n = n()) %>%
  mutate(
    freq = n / sum(n),
    dumm = factor("x")
  )

#TOPIC
# The x axis carries only the dummy factor and the y axis is a proportion,
# so the axis titles say that instead of the original "TASK" / "count".
ggplot(df_summary, aes(y = freq, x = dumm, fill = fct_rev(code_topic))) +
  geom_col() +
  geom_text(
    aes(label = paste(round(freq * 100, 0), "%")),
    size = 3, hjust = 0.5, vjust = 1.5, position = "stack"
  ) +
  scale_fill_brewer(type = "qual", palette = 3) +
  labs(
    title = "TOPIC OF UTTERANCES",
    subtitle = "",
    caption = "The most frequent topic of utterance was relationship (38%) \n with only 16% concerned with nature of variables",
    x = NULL, y = "proportion of utterances"
  ) +
  theme_minimal()

3.2.2 by TASK

# Cross-tabulate topic (rows) against TASK (columns), with each cell shown
# as a share of its row (prop = "r")
ctable(df_coded$code_topic, df_coded$TASK, prop = "r")

Cross-Tabulation, Row Proportions
code_topic * TASK
Data Frame: df_coded

TASK static ixn Total
code_topic
PROCESS 92 (57.5%) 68 (42.5%) 160 (100.0%)
DATASET 100 (56.8%) 76 (43.2%) 176 (100.0%)
VARIABLE 77 (63.1%) 45 (36.9%) 122 (100.0%)
RELATIONSHIP 134 (47.2%) 150 (52.8%) 284 (100.0%)
Total 403 (54.3%) 339 (45.7%) 742 (100.0%)
#DF SUMMARIZED BY TOPIC + TASK
# .groups = "drop" returns an ungrouped tibble: no "grouped output" message
# is printed and no leftover grouping is carried into the plot data.
df_summary <- df_coded %>%
  group_by(code_topic, TASK) %>%
  dplyr::summarise(c = n(), .groups = "drop")

#STACKED BAR BY TASK
# one bar per TASK, stacked by topic, with the count printed in each segment
ggplot(df_summary, aes(x = TASK, y = c, fill = fct_rev(code_topic))) +
  geom_col() +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  scale_fill_brewer(type = "qual", palette = 3) +
  labs(
    title = "TOPICS by TASK",
    subtitle = "",
    x = "TASK", y = "count", fill = "TOPIC"
  ) +
  theme_minimal()

# + theme(legend.position = "blank")

3.2.3 by TASK and DATASET

#DF SUMMARIZED BY TOPIC + TASK + DATASET
# .groups = "drop" returns an ungrouped tibble: no "grouped output" message
# is printed and no leftover grouping is carried into the plot data.
df_summary <- df_coded %>%
  group_by(code_topic, TASK, DATASET) %>%
  dplyr::summarise(c = n(), .groups = "drop")

#PAPER FIGURE HERE
#STACKED BAR BY TASK FACET DATASET
# wrapping in ( ) both assigns the plot to p and prints it
(p <- ggplot(df_summary, aes(x = TASK, y = c, fill = fct_rev(code_topic))) +
  # facet by column name rather than by a df_summary$... vector
  facet_wrap(vars(DATASET)) +
  geom_col() +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  scale_fill_brewer(type = "qual", palette = 3) +
  labs(
    title = "TOPICS by TASK and DATASET",
    subtitle = "",
    x = "TASK", y = "count", fill = "TOPIC"
  ) +
  theme_minimal())

# + theme(legend.position = "none")

# name both arguments in full instead of relying on `file` partially
# matching `filename` and on p falling through to `plot`
ggsave(filename = "figures/UTTERANCE_topics_by_factors.png", plot = p)

3.2.4 by PARTICIPANT

# Cross-tabulate participant (rows) against topic (columns), with each cell
# shown as a share of its row (prop = "r")
ctable(df_coded$PNUM, df_coded$code_topic, prop = "r")

Cross-Tabulation, Row Proportions
PNUM * code_topic
Data Frame: df_coded

code_topic PROCESS DATASET VARIABLE RELATIONSHIP Total
PNUM
P6 6 (20.7%) 5 (17.2%) 5 (17.2%) 13 (44.8%) 29 (100.0%)
P9 19 (18.4%) 36 (35.0%) 23 (22.3%) 25 (24.3%) 103 (100.0%)
P10 8 (18.6%) 11 (25.6%) 8 (18.6%) 16 (37.2%) 43 (100.0%)
P2 3 (10.7%) 8 (28.6%) 16 (57.1%) 1 ( 3.6%) 28 (100.0%)
P4 10 (17.9%) 6 (10.7%) 11 (19.6%) 29 (51.8%) 56 (100.0%)
P12 21 (24.1%) 28 (32.2%) 17 (19.5%) 21 (24.1%) 87 (100.0%)
P13 41 (46.6%) 7 ( 8.0%) 8 ( 9.1%) 32 (36.4%) 88 (100.0%)
P5 6 ( 7.3%) 18 (22.0%) 18 (22.0%) 40 (48.8%) 82 (100.0%)
P7 14 (29.2%) 14 (29.2%) 7 (14.6%) 13 (27.1%) 48 (100.0%)
P8 10 (19.6%) 7 (13.7%) 3 ( 5.9%) 31 (60.8%) 51 (100.0%)
P3 7 (13.0%) 6 (11.1%) 2 ( 3.7%) 39 (72.2%) 54 (100.0%)
P1 8 (32.0%) 4 (16.0%) 0 ( 0.0%) 13 (52.0%) 25 (100.0%)
P11 7 (14.6%) 26 (54.2%) 4 ( 8.3%) 11 (22.9%) 48 (100.0%)
Total 160 (21.6%) 176 (23.7%) 122 (16.4%) 284 (38.3%) 742 (100.0%)
#PAPER FIGURE HERE
#TOPICS by PARTICIPANT facet TASK
# Bars are filled by topic and faceted by TASK; DATASET is not shown, so the
# title names topic rather than dataset.
# Wrapping in ( ) both assigns the plot to p and prints it.
(p <- gf_bar(PNUM ~ ., fill = ~ fct_rev(code_topic), data = df_coded) %>%
  gf_facet_grid(. ~ TASK) +
  scale_fill_brewer(type = "qual", palette = 3) +
  labs(
    title = "Topic of Utterances by Participant and Task",
    subtitle = "",
    x = "number of coded utterances",
    y = "participant",
    fill = "TOPIC"
  ) +
  theme_minimal())

# name both arguments in full instead of relying on `file` partially
# matching `filename` and on p falling through to `plot`
ggsave(filename = "figures/UTTERANCE_topics_by_participant.png", plot = p)



# #TOPICS by PARTICPANT facet TASK
# gf_bar( PNUM ~., fill = ~ fct_rev(code_topic), data = df_coded) %>% 
#   gf_facet_grid(DATASET~TASK) + 
#   scale_fill_brewer(type="qual", palette = 3) +
#   labs(
#     title = "Utterances by Participant, Dataset and Task",
#     subtitle = "",
#     x = "number of coded utterances",
#     y = "participant",
#     fill = "DATASET"
#   )

3.2.5 by TIME

#HISTOGRAMS BY TASK
# 30-second bins on a density scale; panel rows = topic, panel columns = TASK
ggplot(df_coded, aes(x = relative_time)) +
  geom_histogram(
    # after_stat(density) replaces the deprecated ..density.. notation
    aes(
      y = after_stat(density),
      fill = fct_rev(code_topic),
      color = fct_rev(code_topic)
    ),
    binwidth = 30
  ) +
  geom_density() +
  # facet by column name rather than by df_coded$... vectors
  facet_grid(rows = vars(code_topic), cols = vars(TASK)) +
  scale_fill_brewer(type = "qual", palette = 3) +
  scale_color_brewer(type = "qual", palette = 3) +
  theme_minimal() +
  # "none" is the documented value for hiding the legend
  theme(legend.position = "none") +
  labs(
    title = "Topic of Utterance over timecourse of Task",
    x = "timecourse of task (seconds)", y = "frequency of utterances",
    fill = "Topic"
  )

#DOTPLOT
# y is mapped to PNUM, so the axis is titled "Participant"; TASK is the facet.
# Wrapping in ( ) both assigns the plot to p and prints it.
(p <- ggplot(df_coded, aes(x = relative_time, y = PNUM, color = fct_rev(code_topic))) +
  geom_point(alpha = 0.5, size = 3) +
  facet_grid(rows = vars(TASK)) +
  scale_color_brewer(type = "qual", palette = 3) +
  theme_minimal() +
  labs(
    title = "Topic of Utterances over timecourse of Task",
    x = "timecourse of task (seconds)", y = "Participant",
    color = "Topic"
  ))

# name both arguments in full instead of relying on `file` partially
# matching `filename` and on p falling through to `plot`
ggsave(filename = "figures/UTTERANCE_topics_by_time_FACET.png", plot = p)
## Saving 7 x 5 in image
#PAPER FIGURE HERE
#DOTPLOT
# One panel row per participant; inside each panel y is TASK (reversed), so
# the axis is titled "Task".
# Wrapping in ( ) both assigns the plot to p and prints it.
(p <- ggplot(df_coded, aes(x = relative_time, y = fct_rev(TASK), color = fct_rev(code_topic))) +
  geom_point(alpha = 0.5, size = 3) +
  # facet by column name rather than by a df_coded$... vector
  facet_grid(rows = vars(PNUM)) +
  # alternative layout: facet_grid(TASK ~ DATASET)
  scale_color_brewer(type = "qual", palette = 3) +
  theme_minimal() +
  labs(
    title = "Topic of Utterances over timecourse of Task",
    x = "timecourse of task (seconds)", y = "Task",
    color = "Topic"
  ))

# name both arguments in full instead of relying on `file` partially
# matching `filename` and on p falling through to `plot`
ggsave(filename = "figures/UTTERANCE_topics_by_time_STACK.png", plot = p)
## Saving 7 x 5 in image

3.3 [DETAIL of] Utterances

NEXT we explore the distribution of specific detail utterances across Analysis Task, Dataset, Participant and Time.

RQ: What specific things did participants talk aloud during EDA? Are there any details folks only mention during static(v)interactive, or nominal(v)numeric tasks? Any substantial changes in proportion by TASK or DATASET?

Answer:

3.3.1 DETAIL—PROCESS

#PREP DATA FRAMES
# PROCESS-topic rows only, without and with the relative time column
df_process <- df_coded %>%
  filter(code_topic == "PROCESS") %>%
  dplyr::select(pid, PNUM, TASK, DATASET, code_detail)

df_time_process <- df_coded %>%
  filter(code_topic == "PROCESS") %>%
  dplyr::select(pid, PNUM, TASK, DATASET, relative_time, code_detail)

# Counts per detail code by TASK and by DATASET.
# .groups = "drop" returns ungrouped tibbles: no "grouped output" message is
# printed and no leftover grouping is carried into the plots.
df_summary_task <- df_process %>%
  group_by(code_detail, TASK) %>%
  dplyr::summarise(c = n(), .groups = "drop")

df_summary_dataset <- df_process %>%
  group_by(code_detail, DATASET) %>%
  dplyr::summarise(c = n(), .groups = "drop")


# PROCESS detail codes stacked within each TASK
ggplot(df_summary_task, aes(x = TASK, y = c, fill = code_detail)) +
  geom_col() +
  geom_text(
    aes(label = c),
    position = "stack", vjust = 1.5, hjust = 0.5, size = 3
  ) +
  scale_fill_brewer(type = "seq", palette = "PuRd") +
  theme_minimal() +
  labs(
    x = "TASK",
    y = "count",
    title = "PROCESS Utterances by TASK",
    subtitle = "",
    caption = "weak to moderate difference in PROCESS utterances by TASK, \n but these do not seem substantial when broken into the two categories"
  )

# PROCESS detail codes stacked within each DATASET
ggplot(df_summary_dataset, aes(x = DATASET, y = c, fill = code_detail)) +
  geom_col() +
  geom_text(
    aes(label = c),
    position = "stack", vjust = 1.5, hjust = 0.5, size = 3
  ) +
  scale_fill_brewer(type = "seq", palette = "PuRd") +
  theme_minimal() +
  labs(
    x = "DATASET",
    y = "count",
    title = "PROCESS Utterances by DATASET",
    subtitle = "",
    caption = "much more substantial differences in PROCESS utterances by DATASET, \n consistent with intution Ps had more to say about the numeric (vs) nominal outcome variable"
  )

#DETAILS DOTPLOT
# one point per PROCESS utterance coding at its time since task start
ggplot(df_time_process, aes(x = relative_time, y = PNUM, color = fct_rev(code_detail))) +
  geom_point(alpha = 0.5, size = 3) +
  # facet by column name rather than by a df_time_process$... vector
  facet_grid(rows = vars(TASK)) +
  scale_color_brewer(type = "seq", palette = "PuRd") +
  theme_minimal() +
  labs(
    title = "PROCESS Utterances by timecourse of Task",
    caption = "appear randomly distributed through time \n expected and reasonable given PROCESS utterances are meta-level",
    x = "timecourse of task (seconds)", y = "Participant",
    color = "Topic"
  )

#DETAIL HISTOGRAMS BY TASK
# 30-second bins; panel rows = TASK, panel columns = detail code
ggplot(df_time_process, aes(x = relative_time, fill = fct_rev(code_detail))) +
  geom_histogram(binwidth = 30) +
  facet_grid(rows = vars(TASK), cols = vars(code_detail)) +
  scale_fill_brewer(type = "seq", palette = "PuRd") +
  theme_minimal() +
  # "none" is the documented value for hiding the legend
  theme(legend.position = "none") +
  labs(
    title = "PROCESS Utterances by timecourse of Task",
    x = "timecourse of task (seconds)", y = "frequency of utterances"
  )

#PAPER FIGURE HERE
#PROCESSES by PARTICPANT facet TASK
(p <- gf_bar( PNUM ~., fill = ~ fct_rev(code_detail), data = df_process) %>% 
  gf_facet_grid(.~TASK) + 
  scale_fill_brewer(type="seq", palette = "PuRd") +
  labs(
    title = "PROCESS Utterances by Participant and Task",
    subtitle = "",
    caption = "TODO explore P13 representation comments",
    x = "number of coded utterances",
    y = "participant",
    fill = "TOPIC"
  ) + theme_minimal()
)

ggsave(p, file="figures/UTTERANCE_detail_PROCESS_participants.png")

3.3.1.1 PROCESS-REPS-HIGH

PROOF OF CONCEPT FOR reporting REPRESENTATIONS (high level type) LINKED TO UTTERANCES

#FILTER JOINED DATAFRAME
# Keep only the PROCESS-coded rows of the joined utterance/representation data.
df <- filter(df_joined, code_topic == "PROCESS")

#AGGREGATE PROCESS X REP TYPE
# Bars count rows per rep_type; panels are TASK (rows) by detail code (columns).
gf_bar(
  ~rep_type,
  fill = ~code_detail,
  data = df
) %>%
  gf_facet_grid(TASK ~ code_detail) +
  scale_fill_brewer(type = "seq", palette = "PuRd") +
  labs(
    x = "REPN",
    y = "number of coded utterances",
    fill = "TOPIC",
    title = "PROCESS Utterances & Representations ",
    subtitle = "",
    caption = ""
  ) +
  theme_minimal()

#PAPER FIGURE HERE MAYBE
#PARTICIPANT PROCESS X REP
# One horizontal bar per participant, stacked by detail code;
# panels are rep_type (rows) by TASK (columns).
p <- gf_bar(
  PNUM ~ .,
  fill = ~ fct_rev(code_detail),
  data = df
) %>%
  gf_facet_grid(rep_type ~ TASK) +
  scale_fill_brewer(type = "seq", palette = "PuRd") +
  labs(
    x = "number of coded utterances",
    y = "participant",
    fill = "TOPIC",
    title = "PROCESS Utterances & Representations ",
    subtitle = "",
    caption = ""
  ) +
  theme_minimal()
p

3.3.1.2 PROCESS-REPS-LOW STUCK HERE

WHOAH nelly how to [heatmap??] a contingency table? Most likely need to reduce REP list by removing zero cells and just focus on what reps WERE used

#FILTER JOINED DATAFRAME
# df <- df_joined %>% filter(code_topic =="PROCESS")
# 
# c <- table(  df$REP, df$code_detail)
# d <- as.data.frame.matrix(c)
# 
# plot(d)



#HERE
# Mosaic of representation (REP) by PROCESS detail code.
# NOTE(review): relies on `df` still holding the PROCESS-filtered joined data
# assigned in the PROCESS-REPS-HIGH chunk (the local re-filter above is
# commented out) -- confirm before running this chunk on its own.
# The title, count and variable labels are tied to this formula and data; the
# earlier labels described a TASK x DATASET mosaic and renamed variables
# (`graph`, `datset`) that do not appear in this formula.
vcd::mosaic(formula = ~REP + code_detail,
       data = df,
       main = "PROCESS Utterances by Representation and Detail Code",
       sub = paste0("u = ", nrow(df), " utterance-codes"),
       # labeling = labeling_values,
       labeling_args = list(set_varnames = c(REP = "REPRESENTATION",
                            code_detail = "DETAIL CODE")))

# #AGGREGATE PROCESS X REP TYPE
# gf_bar( ~ rep_type, fill = ~code_detail , data = df) %>%
#   gf_facet_grid(TASK~code_detail) +
#   scale_fill_brewer(type="seq", palette = "PuRd") +
#   labs(title = "PROCESS Utterances & Representations ",
#     subtitle = "",
#     caption = "",
#     x = "REPN",
#     y = "number of coded utterances",
#     fill = "TOPIC")+
#   theme_minimal()
#
# #PARTICIPANT PROCESS X REP
# (p <- gf_bar( PNUM ~., fill = ~ fct_rev(code_detail), data = df) %>%
#   gf_facet_grid(rep_type~TASK) +
#   scale_fill_brewer(type="seq", palette = "PuRd") +
#   labs(
#     title = "PROCESS Utterances & Representations ",
#     subtitle = "",
#     caption = "",
#     x = "number of coded utterances",
#     y = "participant",
#     fill = "TOPIC"
#   ) + theme_minimal()
# )

3.3.2 DETAIL—DATASET

#PREP DATA FRAMES
# DATASET-coded utterance-codes (one row per utterance + detail code).
df_dataset <- df_coded %>%
  filter(code_topic == "DATASET") %>%
  dplyr::select(pid, PNUM, TASK, DATASET, code_detail)

# Same rows plus relative_time, used by the timecourse plots below.
df_time_dataset <- df_coded %>%
  filter(code_topic == "DATASET") %>%
  dplyr::select(pid, PNUM, TASK, DATASET, relative_time, code_detail)

# Detail-code counts by TASK. `.groups = "drop"` returns an ungrouped tibble
# and avoids the summarise() regrouping message.
df_summary_task <- df_dataset %>%
  group_by(code_detail, TASK) %>%
  dplyr::summarise(c = n(), .groups = "drop")

# Detail-code counts by DATASET.
df_summary_dataset <- df_dataset %>%
  group_by(code_detail, DATASET) %>%
  dplyr::summarise(c = n(), .groups = "drop")


#DETAILS BY TASK
ggplot(df_summary_task, aes(x = TASK, y = c, fill = code_detail)) +
  geom_col() +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  scale_fill_brewer(type = "seq", palette = 4) +
  labs(title = "DATASET Utterances by TASK",
       subtitle = "",
       caption = "notable decrease MISSING DATA utterances in IXN \n unsure what might explain this, explore at individual level",
       x = "TASK", y = "count") +
  theme_minimal()

#DETAILS BY DATASET
ggplot(df_summary_dataset, aes(x = DATASET, y = c, fill = code_detail)) +
  geom_col() +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  scale_fill_brewer(type = "seq", palette = 4) +
  labs(title = "DATASET Utterances by DATASET",
       subtitle = "",
       caption = "minor differences by DATASET  reasonable given DATASET \n  representations are typically tabular (data dictionary, describe, head/tail)",
       x = "DATASET", y = "count") +
  theme_minimal()

#DETAILS DOTPLOT
# Facets are given as column references (TASK ~ .) instead of an external
# vector, so they stay tied to the rows actually being plotted.
ggplot(df_time_dataset, aes(x = relative_time, y = PNUM, color = fct_rev(code_detail))) +
  geom_point(alpha = 0.5, size = 3) +
  facet_grid(TASK ~ .) +
  scale_color_brewer(type = "seq", palette = 4) +
  theme_minimal() +
  labs(
    title = "DATASET Utterances by timecourse of Task",
    x = "timecourse of task (seconds)", y = "Participant",
    caption = "notable sparsity in center of timecourse, reasonable as EDA normative behavior \n is to consider dataframe shape and missing data at the start of an analysis",
    color = "Topic"
  )

#DETAIL HISTOGRAMS BY TASK
# 30-second bins; the legend is hidden because the facet columns already
# identify each detail code ("none" is the valid value for hiding a legend).
ggplot(df_time_dataset, aes(x = relative_time, fill = fct_rev(code_detail))) +
  geom_histogram(binwidth = 30) +
  facet_grid(TASK ~ code_detail) +
  scale_fill_brewer(type = "seq", palette = 4) +
  labs(
    title = "DATASET Utterances by timecourse of Task",
    x = "timecourse of task (seconds)", y = "frequency of utterances",
    caption = "sensical that the most uniformly distributed detail code is missing data \n as this can be discovered via graphing"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

#PAPER FIGURE HERE
#DATASET UTTERANCES by PARTICIPANT facet TASK
(p <- gf_bar(PNUM ~ ., fill = ~ fct_rev(code_detail), data = df_dataset) %>%
  gf_facet_grid(. ~ TASK) +
  scale_fill_brewer(type = "seq", palette = 4) +
  labs(
    title = "DATASET Utterances by Participant and Task",
    subtitle = "",
    x = "number of coded utterances",
    y = "participant",
    fill = "TOPIC",
    caption = "P11, P12 contribute to MISSING DATA \n P9 contributes largely to variable metadata \n INVESTIGATE FURTHER"
  ) + theme_minimal()
)

# Name both arguments explicitly rather than relying on `file` partially
# matching `filename`.
ggsave(filename = "figures/UTTERANCE_detail_DATASET_participants.png", plot = p)

3.3.3 DETAIL—VARIABLE

#PREP DATA FRAMES
# VARIABLE-coded utterance-codes (one row per utterance + detail code).
df_variable <- df_coded %>%
  filter(code_topic == "VARIABLE") %>%
  dplyr::select(pid, PNUM, TASK, DATASET, code_detail)

# Same rows plus relative_time, used by the timecourse plots below.
df_time_variable <- df_coded %>%
  filter(code_topic == "VARIABLE") %>%
  dplyr::select(pid, PNUM, TASK, DATASET, relative_time, code_detail)

# Detail-code counts by TASK. `.groups = "drop"` returns an ungrouped tibble
# and avoids the summarise() regrouping message.
df_summary_task <- df_variable %>%
  group_by(code_detail, TASK) %>%
  dplyr::summarise(c = n(), .groups = "drop")

# Detail-code counts by DATASET.
df_summary_dataset <- df_variable %>%
  group_by(code_detail, DATASET) %>%
  dplyr::summarise(c = n(), .groups = "drop")


#DETAILS BY TASK
ggplot(df_summary_task, aes(x = TASK, y = c, fill = code_detail)) +
  geom_col() +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  scale_fill_brewer(type = "seq", palette = 5) +
  labs(title = "VARIABLE Utterances by TASK",
       subtitle = "",
       caption = "notably more RANGE obs in STATIC than INTERACTIVE \n TODO consider collapsing distribution-variance ",
       x = "TASK", y = "count") +
  theme_minimal()

#DETAILS BY DATASET
ggplot(df_summary_dataset, aes(x = DATASET, y = c, fill = code_detail)) +
  geom_col() +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  scale_fill_brewer(type = "seq", palette = 5) +
  labs(title = "VARIABLE Utterances by DATASET",
       caption = "Notably more SHAPE in SPACE than HAPPINESS \n notably fewer RANGE in SPACE than HAPPINESS \n TODO are shape and range normalized across variable types? ",
       subtitle = "",
       x = "DATASET", y = "count") +
  theme_minimal()

#DETAILS DOTPLOT
# Facets are given as column references (TASK ~ .) instead of an external
# vector, so they stay tied to the rows actually being plotted.
ggplot(df_time_variable, aes(x = relative_time, y = PNUM, color = fct_rev(code_detail))) +
  geom_point(alpha = 0.5, size = 3) +
  facet_grid(TASK ~ .) +
  scale_color_brewer(type = "seq", palette = 5) +
  theme_minimal() +
  labs(
    title = "VARIABLE Utterances by timecourse of Task",
    caption = "IXN appears more BIMODAL than STATIC where the distribution is more uniform",
    x = "timecourse of task (seconds)", y = "Participant",
    color = "Topic"
  )

#DETAIL HISTOGRAMS BY TASK
# 30-second bins; the legend is hidden because the facet columns already
# identify each detail code ("none" is the valid value for hiding a legend).
ggplot(df_time_variable, aes(x = relative_time, fill = fct_rev(code_detail))) +
  geom_histogram(binwidth = 30) +
  facet_grid(TASK ~ code_detail) +
  scale_fill_brewer(type = "seq", palette = 5) +
  labs(
    title = "VARIABLE Utterances by timecourse of Task",
    x = "timecourse of task (seconds)", y = "frequency of utterances"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

#PAPER FIGURE HERE
#VARIABLE UTTERANCES by PARTICIPANT facet TASK
(p <- gf_bar(PNUM ~ ., fill = ~ fct_rev(code_detail), data = df_variable) %>%
  gf_facet_grid(. ~ TASK) +
  scale_fill_brewer(type = "seq", palette = 5) +
  labs(
    title = "VARIABLE Utterances by Participant and Task",
    subtitle = "",
    x = "number of coded utterances",
    y = "participant",
    fill = "TOPIC",
    caption = "substantial individual differences, most everyone made some comments \n at some point about distribution shape, but \n discussion of outliers, variance and range was more idiosyncratic \n TODO INVESTIGATE P4 STATIC RANGE very high"
  ) + theme_minimal()
)

# Name both arguments explicitly rather than relying on `file` partially
# matching `filename`.
ggsave(filename = "figures/UTTERANCE_detail_VARIABLE_participants.png", plot = p)

3.3.4 DETAIL—RELATIONSHIP

#PREP DATA FRAMES
# RELATIONSHIP-coded utterance-codes (one row per utterance + detail code).
df_relationship <- df_coded %>%
  filter(code_topic == "RELATIONSHIP") %>%
  dplyr::select(pid, PNUM, TASK, DATASET, code_detail)

# Same rows plus relative_time, used by the timecourse plots below.
df_time_relationship <- df_coded %>%
  filter(code_topic == "RELATIONSHIP") %>%
  dplyr::select(pid, PNUM, TASK, DATASET, relative_time, code_detail)

# Detail-code counts by TASK. `.groups = "drop"` returns an ungrouped tibble
# and avoids the summarise() regrouping message.
df_summary_task <- df_relationship %>%
  group_by(code_detail, TASK) %>%
  dplyr::summarise(c = n(), .groups = "drop")

# Detail-code counts by DATASET.
df_summary_relationship <- df_relationship %>%
  group_by(code_detail, DATASET) %>%
  dplyr::summarise(c = n(), .groups = "drop")


#DETAILS BY TASK
ggplot(df_summary_task, aes(x = TASK, y = c, fill = code_detail)) +
  geom_col() +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  scale_fill_brewer(type = "seq", palette = 3) +
  labs(title = "RELATIONSHIP Utterances by TASK",
       subtitle = "",
       caption = "TODO think about proportion of existence and strength/direction",
       x = "TASK", y = "count") +
  theme_minimal()

#DETAILS BY DATASET
ggplot(df_summary_relationship, aes(x = DATASET, y = c, fill = code_detail)) +
  geom_col() +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  scale_fill_brewer(type = "seq", palette = 3) +
  labs(title = "RELATIONSHIP Utterances by DATASET",
       subtitle = "",
       caption = "substantial differences by DATASET, \n consistent with pattern of results with VARIABLE utterances \n perhaps due to sparsity of knowledge of analysis of nominal X nominal relationships, and tool coverage",
       x = "DATASET", y = "count") +
  theme_minimal()

##CONSIDER THIS
# Detail-code counts in a TASK (rows) x DATASET (columns) grid.
gf_bar(~code_detail, fill = ~code_detail, data = df_relationship) %>%
  gf_facet_grid(TASK ~ DATASET) +
  labs(
    title = "LOOK at this more closely for data validation on relationship utts",
    caption = "TODO CONSIDER THIS"
  )

#DETAILS DOTPLOT
# Facets are given as column references (TASK ~ .) instead of an external
# vector, so they stay tied to the rows actually being plotted.
ggplot(df_time_relationship, aes(x = relative_time, y = PNUM, color = fct_rev(code_detail))) +
  geom_point(alpha = 0.5, size = 3) +
  facet_grid(TASK ~ .) +
  scale_color_brewer(type = "seq", palette = 3) +
  theme_minimal() +
  labs(
    title = "RELATIONSHIP Utterances by timecourse of Task",
    caption = "uniformly distributed, as expected \n may see variance if dimension of HYPOTHESIS vs OBSERVATION was coded",
    x = "timecourse of task (seconds)", y = "Participant",
    color = "Topic"
  )

#DETAIL HISTOGRAMS BY TASK
# 30-second bins; the legend is hidden because the facet columns already
# identify each detail code ("none" is the valid value for hiding a legend).
ggplot(df_time_relationship, aes(x = relative_time, fill = fct_rev(code_detail))) +
  geom_histogram(binwidth = 30) +
  facet_grid(TASK ~ code_detail) +
  scale_fill_brewer(type = "seq", palette = 3) +
  labs(
    title = "RELATIONSHIP Utterances by timecourse of Task",
    x = "timecourse of task (seconds)", y = "frequency of utterances"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

#PAPER FIGURE HERE
#RELATIONSHIP UTTERANCES by PARTICIPANT facet TASK
(p <- gf_bar(PNUM ~ ., fill = ~ fct_rev(code_detail), data = df_relationship) %>%
  gf_facet_grid(. ~ TASK) +
  scale_fill_brewer(type = "seq", palette = 3) +
  labs(
    title = "RELATIONSHIP Utterances by Participant and Task",
    subtitle = "",
    x = "number of coded utterances",
    y = "participant",
    fill = "TOPIC"
  ) + theme_minimal()
)

# Name both arguments explicitly rather than relying on `file` partially
# matching `filename`.
ggsave(filename = "figures/UTTERANCE_detail_RELATIONSHIP_participants.png", plot = p)

4 REPRESENTATIONS

Representations are computationally-generated visual-spatial artifacts that participants use during the EDA tasks. These include data visualizations, but also tabular code outputs, or other data structures returned by Python code.

In the following subsections we start by exploring the distribution of kinds of representations participants generated based on TASK, DATASET, and PARTICIPANT. We then explore a subset of representations that are associated with utterances (i.e. not every representation is commented on). And finally we dig further into the utterances that are associated with realtime interaction with an interactive visualization (i.e. not every representation generated during the IXN task is interactive, and not every interactive visualization generated is acted upon by the participant).

4.1 [TELEMETRY] Representations

RQ: How many representations did participants generate? Of what kinds? At what times during the process of analysis? (These are the representations coming from telemetry; they are not explicitly connected to utterances. Code-only reps are excluded, but tabular outputs of code cells (e.g., .info(), .describe()) are included.)

4.1.1 NUMBER of REPS

#COUNT BY TASK AND DATASET
# Cross-tabulate telemetry representations: TASK (rows) x DATASET (columns).
# prop = "t" reports each cell as a share of the grand total.
ctable(x = df_telemetry$TASK, 
       y = df_telemetry$DATASET, 
       prop = "t")  
## Cross-Tabulation, Total Proportions  
## TASK * DATASET  
## Data Frame: df_telemetry  
## 
## -------- --------- ------------- ------------- --------------
##            DATASET     happiness         space          Total
##     TASK                                                     
##   static             127 (25.2%)   105 (20.8%)   232 ( 46.0%)
##      ixn             174 (34.5%)    98 (19.4%)   272 ( 54.0%)
##    Total             301 (59.7%)   203 (40.3%)   504 (100.0%)
## -------- --------- ------------- ------------- --------------
#DF SUMMARIZED BY TASK + DATASET
# Number of telemetry representations per TASK x DATASET cell.
# `.groups = "drop"` returns an ungrouped tibble and avoids the summarise()
# regrouping message.
df_summary <- df_telemetry %>%
  group_by(TASK, DATASET) %>%
  dplyr::summarise(
    c = n(),
    .groups = "drop"
  )
#STACKED BAR BY TASK AND DATASET
ggplot(df_summary, aes(x = TASK, y = c, fill = DATASET)) +
  geom_col() +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  # scale_fill_brewer(type="qual", palette = 4) +
  labs(title = "(Telemetry) Representations by TASK and DATASET",
       subtitle = "",
       x = "TASK", y = "count") +
  theme_minimal()

# + theme(legend.position = "blank")

4.1.2 HIGH LEVEL by TASK DATASET

#DF SUMMARIZED BY TASK + DATASET
# Number of telemetry representations per rep_type x TASK x DATASET cell.
# `.groups = "drop"` returns an ungrouped tibble and avoids the summarise()
# regrouping message.
df_summary <- df_telemetry %>%
  group_by(rep_type, TASK, DATASET) %>%
  dplyr::summarise(
    c = n(),
    .groups = "drop"
  )
#PAPER FIGURE HERE
#STACKED BAR BY TASK FACET DATASET
# Facet by column reference (~DATASET) instead of the external vector
# df_summary$DATASET, so the facet stays tied to the rows being plotted.
(p <- ggplot(df_summary, aes(x = TASK, y = c, fill = fct_rev(rep_type))) +
  facet_wrap(~DATASET) +
  geom_col() +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  scale_fill_brewer(type = "qual", palette = 1, direction = -1) +
  labs(title = "[ TELEMETRY] REPRESENTATIONS by TASK and DATASET",
       subtitle = "",
       x = "TASK", y = "count", fill = "REP-TYPE") +
  theme_minimal()
)

# Name both arguments explicitly rather than relying on `file` partially
# matching `filename`.
ggsave(filename = "figures/TELEMETRY_classes_by_factors.png", plot = p)
## Saving 7 x 5 in image
# + theme(legend.position = "blank")

# #HIGH LEVEL
# gf_bar(  ~ rep_type, fill = ~rep_type, position="stack", data = df_telemetry) %>% 
#   gf_facet_grid( DATASET ~ TASK) +
#   scale_fill_brewer(type="qual", palette = 1) +
#   theme_minimal() + labs(
#     title = "[TELEMETRY]  Representations HIGH"
#   )

4.1.3 DETAIL REPS

#COUNT BY TASK 
# Cross-tabulate detailed representation (REP, rows) x TASK (columns).
# prop = "t" reports each cell as a share of the grand total.
ctable(x = df_telemetry$REP, 
       y = df_telemetry$TASK, 
       prop = "t")  
## Cross-Tabulation, Total Proportions  
## REP * TASK  
## Data Frame: df_telemetry  
## 
## ---------------- ------ ------------- ------------- --------------
##                    TASK        static           ixn          Total
##              REP                                                  
##          profile           25 ( 5.0%)    17 ( 3.4%)    42 (  8.3%)
##          barplot            8 ( 1.6%)    35 ( 6.9%)    43 (  8.5%)
##          columns           21 ( 4.2%)     9 ( 1.8%)    30 (  6.0%)
##        dataframe           59 (11.7%)    42 ( 8.3%)   101 ( 20.0%)
##         describe           13 ( 2.6%)     7 ( 1.4%)    20 (  4.0%)
##          heatmap            4 ( 0.8%)     3 ( 0.6%)     7 (  1.4%)
##             hist           18 ( 3.6%)     1 ( 0.2%)    19 (  3.8%)
##             info            3 ( 0.6%)     3 ( 0.6%)     6 (  1.2%)
##         lineplot           13 ( 2.6%)     5 ( 1.0%)    18 (  3.6%)
##   multiviewchart            3 ( 0.6%)    40 ( 7.9%)    43 (  8.5%)
##         pairplot            4 ( 0.8%)     5 ( 1.0%)     9 (  1.8%)
##      scatterplot           46 ( 9.1%)   105 (20.8%)   151 ( 30.0%)
##        stripplot           15 ( 3.0%)     0 ( 0.0%)    15 (  3.0%)
##            Total          232 (46.0%)   272 (54.0%)   504 (100.0%)
## ---------------- ------ ------------- ------------- --------------
#COUNT BY DATASET
# Cross-tabulate detailed representation (REP, rows) x DATASET (columns).
# prop = "t" reports each cell as a share of the grand total.
ctable(x = df_telemetry$REP, 
       y = df_telemetry$DATASET, 
       prop = "t")  
## Cross-Tabulation, Total Proportions  
## REP * DATASET  
## Data Frame: df_telemetry  
## 
## ---------------- --------- ------------- ------------- --------------
##                    DATASET     happiness         space          Total
##              REP                                                     
##          profile              19 ( 3.8%)    23 ( 4.6%)    42 (  8.3%)
##          barplot              19 ( 3.8%)    24 ( 4.8%)    43 (  8.5%)
##          columns              16 ( 3.2%)    14 ( 2.8%)    30 (  6.0%)
##        dataframe              60 (11.9%)    41 ( 8.1%)   101 ( 20.0%)
##         describe              10 ( 2.0%)    10 ( 2.0%)    20 (  4.0%)
##          heatmap               4 ( 0.8%)     3 ( 0.6%)     7 (  1.4%)
##             hist               4 ( 0.8%)    15 ( 3.0%)    19 (  3.8%)
##             info               3 ( 0.6%)     3 ( 0.6%)     6 (  1.2%)
##         lineplot              18 ( 3.6%)     0 ( 0.0%)    18 (  3.6%)
##   multiviewchart              22 ( 4.4%)    21 ( 4.2%)    43 (  8.5%)
##         pairplot               6 ( 1.2%)     3 ( 0.6%)     9 (  1.8%)
##      scatterplot             120 (23.8%)    31 ( 6.2%)   151 ( 30.0%)
##        stripplot               0 ( 0.0%)    15 ( 3.0%)    15 (  3.0%)
##            Total             301 (59.7%)   203 (40.3%)   504 (100.0%)
## ---------------- --------- ------------- ------------- --------------
#PAPER FIGURE HERE
#DETAIL
# Horizontal bars counting each detailed representation (REP), coloured by
# rep_type, in a DATASET (rows) x TASK (columns) grid. The explicit `limits`
# fix the order in which the REP categories are drawn.
(p <- gf_bar(~REP, fill = ~ fct_rev(rep_type), data = df_telemetry) %>%
  gf_facet_grid(DATASET ~ TASK) +
  scale_fill_brewer(type = "qual", palette = 1, direction = -1) +
  coord_flip() +
  scale_x_discrete(limits = c(
    "multiviewchart",
    "heatmap",
    "pairplot",
    "stripplot",
    "lineplot",
    "scatterplot",
    "barplot",
    "profile",
    "hist",
    "describe",
    "info",
    "columns",
    "dataframe"
  )) +
  theme_minimal() +
  labs(
    title = "[TELEMETRY]  Representations DETAIL by FACTORS"
  )
)

# Name both arguments explicitly rather than relying on `file` partially
# matching `filename`.
ggsave(
  filename = "figures/TELEMETRY_detail_by_factors.png",
  plot = p,
  width = 7, height = 5, units = "in"
)


#flipped
# gf_bar(  ~ REP, fill = ~fct_rev(rep_type), data = df_telemetry) %>% 
#   gf_facet_grid( TASK ~ DATASET) + 
#   scale_fill_brewer(type="qual", palette = 1, direction = -1) +
#   coord_flip() + 
#   scale_x_discrete(limits = c( 
#                           "multiviewchart",
#                           "heatmap",
#                           "pairplot",
#                           "stripplot",
#                           "lineplot",
#                           "scatterplot",
#                           "barplot",
#                           "profile",
#                           "hist",
#                           "describe",
#                           "info",
#                           "columns",
#                           "dataframe"
#     
#   ))+
#   theme_minimal() + labs(
#     title = "[TELEMETRY]  Representations DETAIL by FACTORS"
# )

# #DF BY REP
# df_summary <- df_telemetry %>% 
#   group_by(REP) %>% 
#   dplyr::summarise(
#     c = n()
#   )

# #STACKED BAR 
# ggplot(df_summary, aes(x = REP, y=c, fill= REP)) + 
#   geom_col() + 
#   geom_text(aes(label=c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") + 
#   # scale_fill_brewer(type="qual", palette = 4) +
#   coord_flip()+
#   labs( title = "(Telemetry) Representations DETAIL",
#         subtitle = "",
#         x= "TASK", y = "count") + theme_minimal() 
# # + theme(legend.position = "blank")

4.1.4 WIP

#COUNT BY TASK 
# Cross-tabulate high-level representation type (rep_type, rows) x TASK
# (columns). prop = "t" reports each cell as a share of the grand total.
ctable(x = df_telemetry$rep_type, 
       y = df_telemetry$TASK, 
       prop = "t")  
## Cross-Tabulation, Total Proportions  
## rep_type * TASK  
## Data Frame: df_telemetry  
## 
## ---------- ------ ------------- ------------- --------------
##              TASK        static           ixn          Total
##   rep_type                                                  
##      CHART          136 (27.0%)   211 (41.9%)   347 ( 68.8%)
##      TABLE           96 (19.0%)    61 (12.1%)   157 ( 31.2%)
##      Total          232 (46.0%)   272 (54.0%)   504 (100.0%)
## ---------- ------ ------------- ------------- --------------
#COUNT BY REP TYPE AND DATASET
# Cross-tabulate high-level representation type (rep_type, rows) x DATASET
# (columns). prop = "t" reports each cell as a share of the grand total.
ctable(x = df_telemetry$rep_type, 
       y = df_telemetry$DATASET, 
       prop = "t")  
## Cross-Tabulation, Total Proportions  
## rep_type * DATASET  
## Data Frame: df_telemetry  
## 
## ---------- --------- ------------- ------------- --------------
##              DATASET     happiness         space          Total
##   rep_type                                                     
##      CHART             212 (42.1%)   135 (26.8%)   347 ( 68.8%)
##      TABLE              89 (17.7%)    68 (13.5%)   157 ( 31.2%)
##      Total             301 (59.7%)   203 (40.3%)   504 (100.0%)
## ---------- --------- ------------- ------------- --------------
#DF SUMMARIZED BY TASK + DATASET
# Row counts for every rep_type x TASK x DATASET combination.
df_summary <- dplyr::summarise(
  group_by(df_telemetry, rep_type, TASK, DATASET),
  c = n()
)
## `summarise()` has grouped output by 'rep_type', 'TASK'. You can override using
## the `.groups` argument.
#STACKED BAR BY TASK AND DATASET
# Bars stack the per-rep_type counts; panels are TASK (rows) by DATASET (columns).
ggplot(df_summary, aes(x = TASK, y = c, fill = DATASET)) +
  geom_col() +
  facet_grid(TASK ~ DATASET) +
  geom_text(
    aes(label = c),
    size = 3, hjust = 0.5, vjust = 1.5,
    position = "stack"
  ) +
  # scale_fill_brewer(type="qual", palette = 4) +
  labs(
    x = "TASK", y = "count",
    title = "(Telemetry) Representations by TASK and DATASET",
    subtitle = ""
  ) +
  theme_minimal()

# + theme(legend.position = "blank")

#DF BY REP TYPE (per TASK + DATASET)
# Row counts for every rep_type x TASK x DATASET combination.
# `.groups = "drop"` returns an ungrouped tibble and avoids the summarise()
# regrouping message.
df_summary <- df_telemetry %>%
  group_by(rep_type, TASK, DATASET) %>%
  dplyr::summarise(
    c = n(),
    .groups = "drop"
  )
#STACKED BAR
# x is rep_type, so the axis is labelled accordingly; each bar stacks the
# TASK x DATASET cell counts for that rep_type.
ggplot(df_summary, aes(x = rep_type, y = c, fill = rep_type)) +
  geom_col() +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  # scale_fill_brewer(type="qual", palette = 4) +
  labs(title = "(Telemetry) Representations DETAIL",
       subtitle = "",
       x = "REP-TYPE", y = "count") +
  theme_minimal()

# + theme(legend.position = "blank")



#ALL Representations by task/dataset + made interactive
# Horizontal bars counting each REP, filled by the IXN column,
# in a DATASET (rows) x TASK (columns) grid.
gf_bar(
  ~REP,
  fill = ~IXN,
  data = df_telemetry
) %>%
  gf_facet_grid(DATASET ~ TASK) +
  coord_flip() +
  theme_minimal() +
  labs(title = "ALL Representations by task/dataset + made interactive")

# Telemetry representations per participant, stacked by rep_type, one panel
# per TASK. Title and legend describe what is plotted (representations by
# rep_type) instead of the utterance/TOPIC labels carried over from the
# utterance plots.
gf_bar(PNUM ~ ., fill = ~ fct_rev(rep_type), data = df_telemetry) %>%
  gf_facet_grid(. ~ TASK) +
  scale_fill_brewer(type = "qual", palette = 3) +
  labs(
    title = "Representations by Participant, Type and Task",
    subtitle = "",
    x = "number of representations",
    y = "participant",
    fill = "REP-TYPE"
  ) + theme_minimal()

# Telemetry representations per participant, stacked by DATASET, one panel
# row per detailed representation (REP).
gf_bar(PNUM ~ ., fill = ~ fct_rev(DATASET), data = df_telemetry) %>%
  gf_facet_grid(REP ~ .) +
  scale_fill_brewer(type = "qual", palette = 3) +
  labs(
    title = "Representations by Participant, Representation and Dataset",
    subtitle = "",
    x = "number of representations",
    y = "participant",
    fill = "DATASET"
  ) + theme_minimal()

#ALL Representations by task/dataset, coloured by REP TYPE
# Horizontal bars counting each REP, filled by rep_type,
# in a DATASET (rows) x TASK (columns) grid.
gf_bar(
  ~REP,
  fill = ~rep_type,
  data = df_telemetry
) %>%
  gf_facet_grid(DATASET ~ TASK) +
  coord_flip() +
  scale_fill_brewer(type = "qual", palette = 1) +
  theme_minimal() +
  labs(title = "ALL Representations by task/dataset by REP TYPE")

4.1.5 ADDING INTERACTIONS

#ADDED INTERACTION
# Chart-type telemetry rows only; both plots below narrow further to the
# interactive (ixn) task.
df <- filter(df_telemetry, rep_type == "CHART")

# Counts of IXN values, one panel per DATASET (rows) x REP (columns).
gf_bar(
  ~IXN,
  fill = ~IXN,
  data = filter(df, TASK == "ixn")
) %>%
  gf_facet_grid(DATASET ~ REP) +
  scale_fill_brewer(type = "qual", palette = 2, direction = -1) +
  theme_minimal() +
  labs(
    fill = "added ixn",
    title = "Where do participants add Interaction?",
    subtitle = "(during the Interactive Task) which visualizations are made interactive?",
    caption = "TODO INVESTIGATE where is profiler seems really low \n added ixn here means they added an ixn technique"
  )

# Same data as horizontal bars per REP, stacked by IXN, one panel row per DATASET.
gf_bar(
  ~REP,
  fill = ~IXN,
  data = filter(df, TASK == "ixn")
) %>%
  gf_facet_grid(DATASET ~ .) +
  coord_flip() +
  scale_fill_brewer(type = "qual", palette = 2, direction = -1) +
  theme_minimal() +
  labs(
    fill = "added ixn",
    title = "Where do participants add Interaction?",
    subtitle = "(during the Interactive Task) which visualizations are made interactive?",
    caption = "TODO INVESTIGATE where is profiler seems really low \n added ixn here means they added an ixn technique"
  )

4.1.6 TODO IXN TECHNIQUES

# Frequency table of the `technique` factor; its levels are "+"-joined
# combinations of interaction techniques, plus "none".
freq(df_telemetry$technique)
## Frequencies  
## df_telemetry$technique  
## Type: Factor  
## 
##                                                                     Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------------------------------------------------------------- ------ --------- -------------- --------- --------------
##                                                      filter_brush     11      2.18           2.18      2.18           2.18
##                                        filter_brush+filter_slider      2      0.40           2.58      0.40           2.58
##          filter_brush+filter_slider+highlight_point+tooltip_hover      2      0.40           2.98      0.40           2.98
##                               filter_brush+filter_slider+pan_zoom      1      0.20           3.17      0.20           3.17
##                                                     filter_slider     11      2.18           5.36      2.18           5.36
##                                     filter_slider+highlight_brush      5      0.99           6.35      0.99           6.35
##       filter_slider+highlight_brush+highlight_point+tooltip_hover      2      0.40           6.75      0.40           6.75
##                       filter_slider+highlight_brush+tooltip_hover     10      1.98           8.73      1.98           8.73
##                       filter_slider+highlight_point+tooltip_hover      2      0.40           9.13      0.40           9.13
##                              filter_slider+pan_zoom+tooltip_hover      4      0.79           9.92      0.79           9.92
##                                                       filter_type      5      0.99          10.91      0.99          10.91
##                                                   highlight_brush     21      4.17          15.08      4.17          15.08
##                            highlight_brush+pan_zoom+tooltip_hover      1      0.20          15.28      0.20          15.28
##                                     highlight_brush+tooltip_hover     43      8.53          23.81      8.53          23.81
##                                                   highlight_color      0      0.00          23.81      0.00          23.81
##                                                   highlight_point      1      0.20          24.01      0.20          24.01
##                                     highlight_point+tooltip_hover      1      0.20          24.21      0.20          24.21
##                                                              none    337     66.87          91.07     66.87          91.07
##                                                          pan_zoom      4      0.79          91.87      0.79          91.87
##                                            pan_zoom+tooltip_hover     34      6.75          98.61      6.75          98.61
##                                                     tooltip_hover      7      1.39         100.00      1.39         100.00
##                                                              <NA>      0                               0.00         100.00
##                                                             Total    504    100.00         100.00    100.00         100.00
# Horizontal bar chart of how often each technique combination occurs.
gf_bar(
  ~technique,
  data = df_telemetry
) +
  coord_flip()

4.1.7 TYPES by TASK

#TYPES by TASK
# Cross-tabulate rep_type (rows) x TASK (columns).
ctable(x = df_telemetry$rep_type, 
       y = df_telemetry$TASK, 
       prop = "t")   # "t" = total proportions (each cell as a share of the grand total)

Cross-Tabulation, Total Proportions
rep_type * TASK
Data Frame: df_telemetry

TASK static ixn Total
rep_type
CHART 136 (27.0%) 211 (41.9%) 347 ( 68.8%)
TABLE 96 (19.0%) 61 (12.1%) 157 ( 31.2%)
Total 232 (46.0%) 272 (54.0%) 504 (100.0%)
#REPS by TASK
# Cross-tabulate detailed representation (REP, rows) x TASK (columns).
ctable(x = df_telemetry$REP, 
       y = df_telemetry$TASK, 
       prop = "t")   # "t" = total proportions (each cell as a share of the grand total)

Cross-Tabulation, Total Proportions
REP * TASK
Data Frame: df_telemetry

TASK static ixn Total
REP
profile 25 ( 5.0%) 17 ( 3.4%) 42 ( 8.3%)
barplot 8 ( 1.6%) 35 ( 6.9%) 43 ( 8.5%)
columns 21 ( 4.2%) 9 ( 1.8%) 30 ( 6.0%)
dataframe 59 (11.7%) 42 ( 8.3%) 101 ( 20.0%)
describe 13 ( 2.6%) 7 ( 1.4%) 20 ( 4.0%)
heatmap 4 ( 0.8%) 3 ( 0.6%) 7 ( 1.4%)
hist 18 ( 3.6%) 1 ( 0.2%) 19 ( 3.8%)
info 3 ( 0.6%) 3 ( 0.6%) 6 ( 1.2%)
lineplot 13 ( 2.6%) 5 ( 1.0%) 18 ( 3.6%)
multiviewchart 3 ( 0.6%) 40 ( 7.9%) 43 ( 8.5%)
pairplot 4 ( 0.8%) 5 ( 1.0%) 9 ( 1.8%)
scatterplot 46 ( 9.1%) 105 (20.8%) 151 ( 30.0%)
stripplot 15 ( 3.0%) 0 ( 0.0%) 15 ( 3.0%)
Total 232 (46.0%) 272 (54.0%) 504 (100.0%)

4.1.8 TYPES by DATASET

#TYPES by DATASET
# Cross-tabulate rep_type (rows) x DATASET (columns).
# prop = "t" reports each cell as a share of the grand total.
ctable(x = df_telemetry$rep_type, 
       y = df_telemetry$DATASET, 
       prop = "t")   

Cross-Tabulation, Total Proportions
rep_type * DATASET
Data Frame: df_telemetry

DATASET happiness space Total
rep_type
CHART 212 (42.1%) 135 (26.8%) 347 ( 68.8%)
TABLE 89 (17.7%) 68 (13.5%) 157 ( 31.2%)
Total 301 (59.7%) 203 (40.3%) 504 (100.0%)
#REPS by DATASET
# Cross-tabulate the individual representation (REP) against DATASET for the telemetry data.
# prop = "t" reports total proportions: each cell as a share of all rows.
ctable(x = df_telemetry$REP, 
       y = df_telemetry$DATASET, 
       prop = "t")   

Cross-Tabulation, Total Proportions
REP * DATASET
Data Frame: df_telemetry

DATASET happiness space Total
REP
profile 19 ( 3.8%) 23 ( 4.6%) 42 ( 8.3%)
barplot 19 ( 3.8%) 24 ( 4.8%) 43 ( 8.5%)
columns 16 ( 3.2%) 14 ( 2.8%) 30 ( 6.0%)
dataframe 60 (11.9%) 41 ( 8.1%) 101 ( 20.0%)
describe 10 ( 2.0%) 10 ( 2.0%) 20 ( 4.0%)
heatmap 4 ( 0.8%) 3 ( 0.6%) 7 ( 1.4%)
hist 4 ( 0.8%) 15 ( 3.0%) 19 ( 3.8%)
info 3 ( 0.6%) 3 ( 0.6%) 6 ( 1.2%)
lineplot 18 ( 3.6%) 0 ( 0.0%) 18 ( 3.6%)
multiviewchart 22 ( 4.4%) 21 ( 4.2%) 43 ( 8.5%)
pairplot 6 ( 1.2%) 3 ( 0.6%) 9 ( 1.8%)
scatterplot 120 (23.8%) 31 ( 6.2%) 151 ( 30.0%)
stripplot 0 ( 0.0%) 15 ( 3.0%) 15 ( 3.0%)
Total 301 (59.7%) 203 (40.3%) 504 (100.0%)

4.1.9 by PARTICIPANT — TOTAL

# Telemetry representations per participant: one bar per PNUM, filled by
# DATASET, with one facet column per TASK
gf_bar(PNUM ~ ., data = df_telemetry, fill = ~DATASET) %>%
  gf_facet_grid(. ~ TASK) +
  scale_fill_brewer(type = "qual", palette = 2) +
  theme_minimal() +
  labs(
    title = "Telemetry Representations by Participant, Dataset and Task",
    subtitle = "",
    x = "number of REPRESENTATIONS",
    y = "participant",
    fill = "DATASET"
  )

4.1.10 by PARTICIPANT — TYPE

#COUNT BY PARTICIPANT AND REP-TYPE
# Cross-tabulate participant (x) against representation type (y).
# prop = "r" reports row proportions: each participant's row sums to 100%.
ctable(x = df_telemetry$PNUM, 
       y = df_telemetry$rep_type, 
       prop = "r")  

Cross-Tabulation, Row Proportions
PNUM * rep_type
Data Frame: df_telemetry

rep_type CHART TABLE Total
PNUM
P6 27 (62.8%) 16 (37.2%) 43 (100.0%)
P9 30 (73.2%) 11 (26.8%) 41 (100.0%)
P10 8 (24.2%) 25 (75.8%) 33 (100.0%)
P2 9 (47.4%) 10 (52.6%) 19 (100.0%)
P4 15 (48.4%) 16 (51.6%) 31 (100.0%)
P12 20 (74.1%) 7 (25.9%) 27 (100.0%)
P13 24 (77.4%) 7 (22.6%) 31 (100.0%)
P5 29 (74.4%) 10 (25.6%) 39 (100.0%)
P7 13 (86.7%) 2 (13.3%) 15 (100.0%)
P8 52 (80.0%) 13 (20.0%) 65 (100.0%)
P3 80 (82.5%) 17 (17.5%) 97 (100.0%)
P1 32 (66.7%) 16 (33.3%) 48 (100.0%)
P11 8 (53.3%) 7 (46.7%) 15 (100.0%)
Total 347 (68.8%) 157 (31.2%) 504 (100.0%)
# Telemetry representations per participant, filled by (reversed) rep_type and
# faceted with DATASET as rows and TASK as columns
gf_bar(PNUM ~ ., data = df_telemetry, fill = ~ fct_rev(rep_type)) %>%
  gf_facet_grid(DATASET ~ TASK) +
  scale_fill_brewer(type = "qual", palette = 1, direction = -1) +
  theme_minimal() +
  labs(
    title = "Telemetry Representations by Participant, Dataset and Task",
    subtitle = "",
    x = "number of REPRESENTATIONS",
    y = "participant",
    fill = "REP-TYPE"
  )

4.1.11 TODO timestamps on telemetry needs cleaning

# #DOTPLOT
# ggplot(df_coded, aes(x=relative_time, y = PNUM)) + 
#   geom_point(alpha=0.5, size=3) +
#   facet_grid(df_coded$TASK) +
#   scale_color_brewer(type="qual", palette = 3) +
#   theme_minimal() + labs(
#     title = "Participant Utterances over timecourse of Task",
#     x= "timecourse of task (seconds)", y = "Participant",
#     color = "Topic"
#   ) 
# 
# 
# #HISTOGRAMS BY TASK
# ggplot(df_coded, aes(x = relative_time)) + 
#   geom_histogram(binwidth = 30,aes(y=..density..)) + 
#   geom_density()+
#   facet_grid(df_coded$TASK) +
#   theme_minimal() + labs(
#     title = "Participant Utterances over timecourse of Task",
#     x= "timecourse of task (seconds)", y = "frequency of utterances",
#   ) + theme_minimal() + theme(legend.position = "blank")

4.2 [Utterance] Representations

THIS SECTION covers representations EXPLICITLY LINKED to UTTERANCES. It does not include ALL representations generated, but rather the representations that were in use when the participant generated utterances.

4.2.1 by TASK OR DATASET

#PROPS IN PAPER
# Frequency table of rep_type across all rows of df_joined.
# The options below switch off cumulative columns, the heading block and NA
# reporting, so the printed table shows only the Freq and % columns.
freq(df_joined$rep_type, 
     cumul      = FALSE,
     headings   = FALSE,
     report.nas = FALSE,
     plain.ascii = FALSE) 
  Freq %
CHART 455 59.87
CODE 262 34.47
NONE 43 5.66
Total 760 100.00
#COUNT BY TASK
# Cross-tabulate utterance-linked representation type (x) against TASK (y).
# prop = "t" reports total proportions: each cell as a share of all rows.
ctable(x = df_joined$rep_type, 
       y = df_joined$TASK, 
       prop = "t")  

Cross-Tabulation, Total Proportions
rep_type * TASK
Data Frame: df_joined

TASK static ixn Total
rep_type
CHART 229 (30.1%) 226 (29.7%) 455 ( 59.9%)
CODE 165 (21.7%) 97 (12.8%) 262 ( 34.5%)
NONE 24 ( 3.2%) 19 ( 2.5%) 43 ( 5.7%)
Total 418 (55.0%) 342 (45.0%) 760 (100.0%)
#DF SUMMARIZED BY REP-TYPE + TASK
# One row per (rep_type, TASK) pair; `c` is the number of df_joined rows in it.
df_summary <- df_joined %>%
  group_by(rep_type, TASK) %>%
  # "drop_last" is what dplyr does by default for this call; stating it keeps
  # the result unchanged and silences the regrouping message
  dplyr::summarise(c = n(), .groups = "drop_last")

#STACKED BAR BY TASK
ggplot(df_summary, aes(x = TASK, y = c, fill = fct_rev(rep_type))) +
  geom_col() +
  # print each segment's count just below the top of the segment
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  scale_fill_brewer(type = "qual", palette = 4) +
  # the fill aesthetic is rep_type, so the legend is titled REP-TYPE (was "TOPIC")
  labs(title = "UTTERANCE-REPS by TASK",
       subtitle = "",
       x = "TASK", y = "count", fill = "REP-TYPE") +
  theme_minimal()

# + theme(legend.position = "none")

#COUNT BY DATASET
# Cross-tabulate utterance-linked representation type (x) against DATASET (y).
# prop = "t" reports total proportions: each cell as a share of all rows.
ctable(x = df_joined$rep_type, 
       y = df_joined$DATASET, 
       prop = "t")  

Cross-Tabulation, Total Proportions
rep_type * DATASET
Data Frame: df_joined

DATASET happiness space Total
rep_type
CHART 299 (39.3%) 156 (20.5%) 455 ( 59.9%)
CODE 115 (15.1%) 147 (19.3%) 262 ( 34.5%)
NONE 27 ( 3.6%) 16 ( 2.1%) 43 ( 5.7%)
Total 441 (58.0%) 319 (42.0%) 760 (100.0%)
#DF SUMMARIZED BY REP-TYPE + DATASET
# One row per (rep_type, DATASET) pair; `c` is the number of df_joined rows in it.
df_summary <- df_joined %>%
  group_by(rep_type, DATASET) %>%
  # "drop_last" is what dplyr does by default for this call; stating it keeps
  # the result unchanged and silences the regrouping message
  dplyr::summarise(c = n(), .groups = "drop_last")

#STACKED BAR BY DATASET
ggplot(df_summary, aes(x = DATASET, y = c, fill = fct_rev(rep_type))) +
  geom_col() +
  # print each segment's count just below the top of the segment
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  scale_fill_brewer(type = "qual", palette = 4) +
  # the fill aesthetic is rep_type, so the legend is titled REP-TYPE (was "TOPIC")
  labs(title = "UTTERANCE-REPS by DATASET",
       subtitle = "",
       x = "DATASET", y = "count", fill = "REP-TYPE") +
  theme_minimal()

# + theme(legend.position = "none")

4.2.2 by TASK and DATASET

#DF SUMMARIZED BY REP-TYPE + TASK + DATASET
# One row per (rep_type, TASK, DATASET) cell; `c` is the number of df_joined rows in it.
df_summary <- df_joined %>%
  group_by(rep_type, TASK, DATASET) %>%
  # "drop_last" is what dplyr does by default for this call; stating it keeps
  # the result unchanged and silences the regrouping message
  dplyr::summarise(c = n(), .groups = "drop_last")

#STACKED BAR BY TASK FACET DATASET
ggplot(df_summary, aes(x = TASK, y = c, fill = fct_rev(rep_type))) +
  # facet by the column in the plot data rather than by an external
  # df_summary$DATASET vector, which only works while row counts line up
  facet_wrap(~DATASET) +
  geom_col() +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  scale_fill_brewer(type = "qual", palette = 4) +
  # the fill aesthetic is rep_type, so the legend is titled REP-TYPE (was "TOPIC")
  labs(title = "UTTERANCE-REPS by TASK and DATASET",
       subtitle = "",
       x = "TASK", y = "count", fill = "REP-TYPE") +
  theme_minimal()

# + theme(legend.position = "none")

4.2.3 by PARTICIPANT

#COUNT BY PARTICIPANT 
# Cross-tabulate participant (x) against utterance-linked representation type (y).
# prop = "r" reports row proportions: each participant's row sums to 100%.
ctable(x = df_joined$PNUM, 
       y = df_joined$rep_type, 
       prop = "r")  

Cross-Tabulation, Row Proportions
PNUM * rep_type
Data Frame: df_joined

rep_type CHART CODE NONE Total
PNUM
P6 16 (55.2%) 13 (44.8%) 0 ( 0.0%) 29 (100.0%)
P9 59 (55.1%) 39 (36.4%) 9 ( 8.4%) 107 (100.0%)
P10 33 (76.7%) 7 (16.3%) 3 ( 7.0%) 43 (100.0%)
P2 20 (71.4%) 7 (25.0%) 1 ( 3.6%) 28 (100.0%)
P4 18 (28.6%) 43 (68.3%) 2 ( 3.2%) 63 (100.0%)
P12 45 (51.7%) 29 (33.3%) 13 (14.9%) 87 (100.0%)
P13 66 (75.0%) 20 (22.7%) 2 ( 2.3%) 88 (100.0%)
P5 69 (79.3%) 17 (19.5%) 1 ( 1.1%) 87 (100.0%)
P7 13 (27.1%) 31 (64.6%) 4 ( 8.3%) 48 (100.0%)
P8 40 (78.4%) 9 (17.6%) 2 ( 3.9%) 51 (100.0%)
P3 42 (77.8%) 7 (13.0%) 5 ( 9.3%) 54 (100.0%)
P1 19 (73.1%) 6 (23.1%) 1 ( 3.8%) 26 (100.0%)
P11 15 (30.6%) 34 (69.4%) 0 ( 0.0%) 49 (100.0%)
Total 455 (59.9%) 262 (34.5%) 43 ( 5.7%) 760 (100.0%)
#REP-TYPES by PARTICIPANT facet DATASET x TASK
# Bars count df_joined rows per participant, filled by (reversed) rep_type.
# The plot is stored in `p` and printed by the enclosing parentheses.
(p <- gf_bar(PNUM ~ ., fill = ~ fct_rev(rep_type), data = df_joined) %>%
  gf_facet_grid(DATASET ~ TASK) +
  scale_fill_brewer(type = "qual", palette = 4) +
  labs(
    # title typo fixed ("Represenations")
    title = "Utterance-Representations by Participant, Dataset and Task",
    subtitle = "",
    x = "number of representations",
    y = "participant",
    # the fill aesthetic is rep_type, so the legend is titled REP-TYPE (was "TOPIC")
    fill = "REP-TYPE"
  ) + theme_minimal())

# ggsave(filename = "figures/utterance_reptypes_by_count.png", plot = p)

4.2.4 by TIME

#HISTOGRAMS BY REP-TYPE (rows) AND TASK (columns)
# Density-scaled histograms (30-second bins) of relative_time with a density
# curve overlaid, one panel per rep_type x TASK combination.
ggplot(df_joined, aes(x = relative_time)) +
  geom_histogram(
    binwidth = 30,
    aes(y = ..density.., fill = fct_rev(rep_type), color = fct_rev(rep_type))
  ) +
  geom_density() +
  # facet on columns of the plot data instead of df_joined$... vectors
  facet_grid(rep_type ~ TASK) +
  scale_fill_brewer(type = "qual", palette = 4) +
  scale_color_brewer(type = "qual", palette = 4) +
  labs(
    title = "UTTERANCE-REPS over timecourse of Task",
    x = "timecourse of task (seconds)", y = "frequency of utterance-reps"
  ) +
  theme_minimal() +
  # "none" is the documented value for hiding the legend; "blank" is not a
  # valid legend.position. The facet rows already identify each rep_type.
  theme(legend.position = "none")

#DOTPLOT — FACET TASK
# One point per df_joined row: x = time within the task, y = participant,
# colour = rep_type, one facet row per TASK.
(p <- ggplot(df_joined, aes(x = relative_time, y = PNUM, color = fct_rev(rep_type))) +
  geom_point(alpha = 0.5, size = 3) +
  # facet on the TASK column of the plot data instead of a df_joined$TASK vector
  facet_grid(TASK ~ .) +
  scale_color_brewer(type = "qual", palette = 4) +
  theme_minimal() + labs(
    title = "Utterance-Reps over timecourse of Task",
    # y is mapped to PNUM, so the axis is titled "Participant" (was "Task");
    # colour is mapped to rep_type, so the legend is titled "Rep-Type" (was "Topic")
    x = "timecourse of task (seconds)", y = "Participant",
    color = "Rep-Type"
  ))

# name both arguments instead of relying on `file` partially matching `filename`
ggsave(filename = "figures/UTTREP_classes_by_time_FACET.png", plot = p)
## Saving 7 x 5 in image
#DOTPLOT STACKED TASKS
# One point per df_joined row: x = time within the task, y = TASK (reversed
# level order), colour = rep_type, one facet row per participant.
(p <- ggplot(df_joined, aes(x = relative_time, y = fct_rev(TASK), color = fct_rev(rep_type))) +
  geom_point(alpha = 0.5, size = 3) +
  # facet on the PNUM column of the plot data instead of a df_joined$PNUM vector
  facet_grid(PNUM ~ .) +
  # facet_grid(TASK ~ DATASET) +
  scale_color_brewer(type = "qual", palette = 4) +
  theme_minimal() + labs(
    title = "Utterance-Reps over timecourse of Task",
    # y is mapped to TASK, so the axis is titled "Task" (was "Participant");
    # colour is mapped to rep_type, so the legend is titled "Rep-Type" (was "Topic")
    x = "timecourse of task (seconds)", y = "Task",
    color = "Rep-Type"
  ))

# name both arguments instead of relying on `file` partially matching `filename`
ggsave(filename = "figures/UTTREP_classes_by_time_STACK.png", plot = p)
## Saving 7 x 5 in image

4.3 [Enacted] Representations [TODO PRIORITY]

THIS SECTION would include the (very) small number of UTTERANCE-REPS that are flagged as occurring while interaction was ACTIVELY BEING USED. This SHOULD be a quick but effective analysis.

#DOTPLOT
# Keep only rows from the interactive task, then plot one point per row:
# x = time within the task, y = TASK, colour = the `ixn` flag (reversed level
# order), one facet row per participant.
df <- df_joined %>% filter(TASK == "ixn")
ggplot(df, aes(x = relative_time, y = fct_rev(TASK), color = fct_rev(ixn))) +
  geom_point(alpha = 0.5, size = 3) +
  # facet on the PNUM column of the plot data; df$PNUM would silently pick up
  # whatever the global `df` holds when the plot is drawn
  facet_grid(PNUM ~ .) +
  # facet_grid(TASK ~ DATASET) +
  scale_color_brewer(type = "qual", palette = 4) +
  theme_minimal() + labs(
    title = "Utterance USING ixn during TASK",
    # y is mapped to TASK, so the axis is titled "Task" (was "Participant")
    x = "timecourse of task (seconds)", y = "Task",
    color = "IXV"
  )

#DOTPLOT—BW
# Uncoloured dot plot of every df_coded row: x = time within the task,
# y = participant, one facet row per TASK.
ggplot(df_coded, aes(x = relative_time, y = PNUM)) +
  geom_point(alpha = 0.5, size = 3) +
  # facet on the TASK column of the plot data instead of a df_coded$TASK vector
  facet_grid(TASK ~ .) +
  # no colour aesthetic is mapped, so the former colour scale and colour
  # legend title had no effect and have been dropped
  theme_minimal() + labs(
    title = "Participant Utterances over timecourse of Task",
    x = "timecourse of task (seconds)", y = "Participant"
  )

#DF SUMMARIZED BY DATASET + TOPIC + IXN FLAG (interactive task only)
# One row per (DATASET, code_topic, ixn) cell; `c` is the number of rows in it.
df_summary <- df_joined %>%
  filter(TASK == "ixn") %>%  #can only occur during interactive task
  group_by(DATASET, code_topic, ixn) %>%
  dplyr::summarise(.groups = "keep",
    c = n()
  )

##no need to summarize by rep_type because only vis can be ixn
## consider middle level summary of uni vs bi vs multivariate vis 


#STACKED BAR BY DATASET, FACET TOPIC
ggplot(df_summary, aes(x = DATASET, y = c, fill = ixn)) +
  geom_col() +
  # facet on the code_topic column of the plot data instead of an external
  # df_summary$code_topic vector
  facet_wrap(~code_topic) +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  # scale_fill_brewer(type="qual", palette = 4) +
  labs(title = "KINDS of utterances made WHILE INTERACTING with ixn visualization",
       subtitle = "TODO ZOOM IN ON THIS",
       x = "DATASET", y = "count") + theme_minimal()

# + theme(legend.position = "none")
# Interactive-task rows only, trimmed to the columns used for the ixn analysis
df <- df_joined %>%
  filter(TASK == "ixn") %>%
  dplyr::select(
    PNUM, TASK, DATASET, code_topic, code_detail,
    REP, rep_type, relative_time, ixn
  )


# Count of rows per REP, filled by ixn; facet rows = ixn, facet columns = DATASET
gf_bar(~REP, data = df, fill = ~ixn) %>%
  gf_facet_grid(ixn ~ DATASET) +
  labs(title = "Utterances WHILE INTERACTING with IXN representations") +
  theme_minimal()

#DF SUMMARIZED BY DATASET + REP + IXN FLAG (interactive task only)
# One row per (DATASET, REP, ixn) cell; `c` is the number of rows in it.
df_summary <- df_joined %>%
  filter(TASK == "ixn") %>%  #can only occur during interactive task
  group_by(DATASET, REP, ixn) %>%
  # namespaced like every other summarise() call in this file, so the result
  # does not depend on which attached package masks `summarise`
  dplyr::summarise(.groups = "keep",
    c = n()
  )

#STACKED BAR BY REP FACET DATASET
ggplot(df_summary, aes(x = REP, y = c, fill = ixn)) +
  # facet on the DATASET column of the plot data instead of an external
  # df_summary$DATASET vector
  facet_wrap(~DATASET) +
  geom_col() +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  scale_fill_brewer(type = "qual", palette = 4) +
  labs(title = "Utterances WHILE INTERACTING with IXN representations",
       subtitle = "",
       x = "", y = "count", fill = "interacting?") + theme_minimal()

# + theme(legend.position = "none")

5 MODELLING

#DEFINE DATAFRAME
# Participant id, utterance id, task and dataset for each df_coded row.
# select() is namespaced, matching the dplyr::select() call used earlier in the
# file, so another attached package's select() cannot be picked up instead.
df <- df_coded %>% dplyr::select(pid, uid, TASK, DATASET)
  
# #MOSAIC PLOT
# mosaic(formula = ~DATASET + TASK, 
#        data = df,
#        main = "Proportion of Utterances by TASK and DATASET", 
#        sub = "u = 734 utterance-codes",
#        labeling = labeling_values,
#        labeling_args = list(set_varnames = c(graph = "TASK",
#                             datset = "DATASET")))

5.1 Predicting NUMBER of UTTERANCES

How much variance in the number of utterances is explained by DATASET, TASK and PARTICIPANT?

5.1.1 OLS Mixed Effects Model

#DEFINE DATAFRAME
# Count df_coded rows in every participant x dataset x task cell; the result
# stays grouped by all three variables (.groups = "keep")
df <- df_coded %>%
  group_by(pid, DATASET, TASK) %>%
  dplyr::summarise(n_utterances = n(), .groups = "keep")

#NUMBER UTTERANCES predicted by DATASET + TASK | participant --> MIXED LINEAR REGRESSION
print("LMER, UTTERANCES ~ DATASET + TASK")
## [1] "LMER, UTTERANCES ~ DATASET + TASK"
# Additive model: fixed effects for DATASET and TASK, plus a random intercept
# for each participant (pid), fitted to the per-cell counts in df
mm1 <- lmer(n_utterances ~ DATASET + TASK+ (1|pid), data = df)
paste("Model")
## [1] "Model"
# Model summary table (fit statistics, fixed and random effects)
summ(mm1)
Observations 26
Dependent variable n_utterances
Type Mixed effects linear regression
AIC 198.45
BIC 204.74
Pseudo-R² (fixed effects) 0.12
Pseudo-R² (total) 0.68
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 35.11 4.17 8.42 20.26 0.00
DATASETspace -8.90 3.33 -2.67 11.00 0.02
TASKixn -4.24 3.33 -1.27 11.00 0.23
p values calculated using Satterthwaite d.f.
Random Effects
Group Parameter Std. Dev.
pid (Intercept) 11.12
Residual 8.47
Grouping Variables
Group # groups ICC
pid 13 0.63
paste("Partition Variance")
## [1] "Partition Variance"
# F-tests for the fixed effects; the rendered output is a Type III table using
# Satterthwaite's method
anova(mm1)
## Type III Analysis of Variance Table with Satterthwaite's method
##         Sum Sq Mean Sq NumDF DenDF F value Pr(>F)  
## DATASET 512.37  512.37     1    11  7.1424 0.0217 *
## TASK    116.06  116.06     1    11  1.6179 0.2296  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
paste("Confidence Interval on Parameter Estimates")
## [1] "Confidence Interval on Parameter Estimates"
# 95% intervals for the variance components (.sig01, .sigma) and the fixed
# effects; the rendered output shows these are profile confidence intervals
confint(mm1)
## Computing profile confidence intervals ...
##                   2.5 %    97.5 %
## .sig01         5.839640 18.047250
## .sigma         5.540684 12.104462
## (Intercept)   26.986050 43.233730
## DATASETspace -15.384322 -2.425202
## TASKixn      -10.717656  2.241465
# Text summary of the model. Per its own output it uses a Wald t-distribution
# approximation, so its CIs and p-values differ slightly from summ()/confint().
report(mm1) #sanity check
## We fitted a linear mixed model (estimated using REML and nloptwrap optimizer)
## to predict n_utterances with DATASET and TASK (formula: n_utterances ~ DATASET
## + TASK). The model included pid as random effect (formula: ~1 | pid). The
## model's total explanatory power is substantial (conditional R2 = 0.68) and the
## part related to the fixed effects alone (marginal R2) is of 0.12. The model's
## intercept, corresponding to DATASET = happiness and TASK = static, is at 35.11
## (95% CI [26.44, 43.78], t(21) = 8.42, p < .001). Within this model:
## 
##   - The effect of DATASET [space] is statistically significant and negative (beta
## = -8.90, 95% CI [-15.83, -1.98], t(21) = -2.67, p = 0.014; Std. beta = -0.61,
## 95% CI [-1.09, -0.14])
##   - The effect of TASK [ixn] is statistically non-significant and negative (beta
## = -4.24, 95% CI [-11.17, 2.69], t(21) = -1.27, p = 0.217; Std. beta = -0.29,
## 95% CI [-0.77, 0.19])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald t-distribution approximation.
# Plot of the model estimates, with the intercept included
plot_model(mm1,  show.intercept = TRUE)

# Model diagnostics — presumably the assumption-check plot panel; confirm which
# package provides check_model() in this file's setup chunk
check_model(mm1)

#NUMBER UTTERANCES predicted by DATASET * TASK | participant --> MIXED LINEAR REGRESSION
print("LMER, UTTERANCES ~ DATASET X TASK")
## [1] "LMER, UTTERANCES ~ DATASET X TASK"
# Interaction model: DATASET, TASK and their interaction as fixed effects, plus
# a random intercept for each participant (pid), fitted to the same df as mm1
mm2 <- lmer(n_utterances ~ DATASET * TASK + (1|pid), data = df)
paste("Model")
## [1] "Model"
# Model summary table (fit statistics, fixed and random effects)
summ(mm2)
Observations 26
Dependent variable n_utterances
Type Mixed effects linear regression
AIC 192.74
BIC 200.29
Pseudo-R² (fixed effects) 0.14
Pseudo-R² (total) 0.70
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 37.57 5.37 7.00 15.55 0.00
DATASETspace -14.24 7.90 -1.80 15.55 0.09
TASKixn -9.57 7.90 -1.21 15.55 0.24
DATASETspace:TASKixn 10.67 14.32 0.74 11.00 0.47
p values calculated using Satterthwaite d.f.
Random Effects
Group Parameter Std. Dev.
pid (Intercept) 11.39
Residual 8.47
Grouping Variables
Group # groups ICC
pid 13 0.64
paste("Partition Variance")
## [1] "Partition Variance"
# F-tests for the fixed effects and the interaction; the rendered output is a
# Type III table using Satterthwaite's method
anova(mm2)
## Type III Analysis of Variance Table with Satterthwaite's method
##              Sum Sq Mean Sq NumDF DenDF F value Pr(>F)  
## DATASET      512.37  512.37     1    11  7.1424 0.0217 *
## TASK         116.06  116.06     1    11  1.6179 0.2296  
## DATASET:TASK  39.80   39.80     1    11  0.5549 0.4720  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
paste("Confidence Interval on Parameter Estimates")
## [1] "Confidence Interval on Parameter Estimates"
# 95% intervals for the variance components (.sig01, .sigma) and the fixed
# effects; the rendered output shows these are profile confidence intervals
confint(mm2)
## Computing profile confidence intervals ...
##                           2.5 %    97.5 %
## .sig01                 5.468708 17.569089
## .sigma                 5.540710 12.104731
## (Intercept)           27.350579 47.792279
## DATASETspace         -29.282779  0.806589
## TASKixn              -24.616112  5.473256
## DATASETspace:TASKixn -17.180459 38.513793
# Text summary of the model. Per its own output it uses a Wald t-distribution
# approximation, so its CIs and p-values differ slightly from summ()/confint().
report(mm2) #sanity check
## We fitted a linear mixed model (estimated using REML and nloptwrap optimizer)
## to predict n_utterances with DATASET and TASK (formula: n_utterances ~ DATASET
## * TASK). The model included pid as random effect (formula: ~1 | pid). The
## model's total explanatory power is substantial (conditional R2 = 0.70) and the
## part related to the fixed effects alone (marginal R2) is of 0.14. The model's
## intercept, corresponding to DATASET = happiness and TASK = static, is at 37.57
## (95% CI [26.38, 48.76], t(20) = 7.00, p < .001). Within this model:
## 
##   - The effect of DATASET [space] is statistically non-significant and negative
## (beta = -14.24, 95% CI [-30.71, 2.24], t(20) = -1.80, p = 0.086; Std. beta =
## -0.98, 95% CI [-2.11, 0.15])
##   - The effect of TASK [ixn] is statistically non-significant and negative (beta
## = -9.57, 95% CI [-26.04, 6.90], t(20) = -1.21, p = 0.240; Std. beta = -0.66,
## 95% CI [-1.79, 0.47])
##   - The effect of DATASET [space] × TASK [ixn] is statistically non-significant
## and positive (beta = 10.67, 95% CI [-19.20, 40.54], t(20) = 0.74, p = 0.465;
## Std. beta = 0.73, 95% CI [-1.32, 2.79])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald t-distribution approximation.
# Plot of the model estimates, with the intercept included
plot_model(mm2,  show.intercept = TRUE)

# Model diagnostics — presumably the assumption-check plot panel; confirm which
# package provides check_model() in this file's setup chunk
check_model(mm2)

### POISSON Mixed Effects Models
### ARF: Poisson regression is recommended over OLS regression for count data
### BUT the estimates are challenging to interpret: with the log link, coefficients are on the log-count scale (not log odds) and must be exponentiated into rate ratios
### it is NOT inappropriate to use OLS instead
# 
# #NUMBER UTTERANCES predicted by TASK + DATASET  | participatnt--> POISSON MIXED LINEAR REGRESSION
# print("POISSON-MER, UTTERANCES ~ DATASET + TASK")
# pmm1 <- glmer(n_utterances ~ TASK + DATASET + (1|pid), data = df, family = "poisson")
# paste("Model")
# summ(pmm1)
# paste("Partition Variance")
# anova(pmm1)
# paste("Confidence Interval on Parameter Estimates")
# confint(pmm1)
# report(pmm1) #sanity check
# plot_model(pmm1,  show.intercept = TRUE)
# check_model(pmm1)
# 
# #NUMBER UTTERANCES predicted by TASK X DATASET  | participatnt--> POISSON MIXED LINEAR REGRESSION
# print("POISSON-MER, UTTERANCES ~ DATASET X TASK")
# pmm2 <- glmer(n_utterances ~ TASK * DATASET + (1|pid), data = df, family = "poisson")
# paste("Model")
# summ(pmm2)
# paste("Partition Variance")
# anova(pmm2)
# paste("Confidence Interval on Parameter Estimates")
# confint(pmm2)
# report(pmm2) #sanity check
# plot_model(pmm2,  show.intercept = TRUE)
# check_model(pmm2)